diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h index 65b2c8f9e..ca5503a96 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h @@ -491,23 +491,6 @@ void cuda_integer_div_rem_radix_ciphertext_64( void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams, int8_t **mem_ptr_void); -uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, - uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks, - uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - uint64_t lut_degree, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type); - -void cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe, - CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr, - void *const *ksks, void *const *bsks, uint32_t num_blocks); - -void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, int8_t **mem_ptr_void); - void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array); @@ -781,60 +764,6 @@ void cuda_integer_ilog2_64( void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams, int8_t **mem_ptr_void); -uint64_t scratch_cuda_compute_equality_selectors_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, 
PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type); - -void cuda_compute_equality_selectors_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list, - CudaRadixCiphertextFFI const *lwe_array_in, uint32_t num_blocks, - const uint64_t *h_decomposed_cleartexts, int8_t *mem, void *const *bsks, - void *const *ksks); - -void cleanup_cuda_compute_equality_selectors_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void); - -uint64_t scratch_cuda_create_possible_results_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type); - -void cuda_create_possible_results_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list, - CudaRadixCiphertextFFI const *lwe_array_in_list, - uint32_t num_possible_values, const uint64_t *h_decomposed_cleartexts, - uint32_t num_blocks, int8_t *mem, void *const *bsks, void *const *ksks); - -void cleanup_cuda_create_possible_results_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void); - -uint64_t scratch_cuda_aggregate_one_hot_vector_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t num_matches, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type); - -void cuda_aggregate_one_hot_vector_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI 
*lwe_array_out, - CudaRadixCiphertextFFI const *lwe_array_in_list, - uint32_t num_input_ciphertexts, uint32_t num_blocks, int8_t *mem, - void *const *bsks, void *const *ksks); - -void cleanup_cuda_aggregate_one_hot_vector_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void); - uint64_t scratch_cuda_unchecked_match_value_64( CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, @@ -894,6 +823,185 @@ void cuda_unchecked_match_value_or_64( void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams, int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_contains_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_contains_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks); + +void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_contains_clear_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + +void 
cuda_unchecked_contains_clear_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, + uint32_t num_inputs, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks); + +void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_is_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, + uint32_t num_clears, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks); + +void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_compute_final_index_from_selectors_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks_index, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_compute_final_index_from_selectors_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors, + uint32_t num_inputs, uint32_t num_blocks_index, 
int8_t *mem, + void *const *bsks, void *const *ksks); + +void cleanup_cuda_compute_final_index_from_selectors_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_index_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, + uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks); + +void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_first_index_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_first_index_in_clears_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input, + const uint64_t *h_unique_values, const uint64_t *h_unique_indices, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, 
+ int8_t *mem, void *const *bsks, void *const *ksks); + +void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_first_index_of_clear_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_first_index_of_clear_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, void *const *bsks, + void *const *ksks); + +void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_first_index_of_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const 
*bsks, void *const *ksks); + +void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); + +uint64_t scratch_cuda_unchecked_index_of_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type); + +void cuda_unchecked_index_of_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks); + +void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void); } // extern C #endif // CUDA_INTEGER_H diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h index 2fc65a3c5..55bc89be7 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h @@ -18,13 +18,13 @@ template struct int_equality_selectors_buffer { CudaRadixCiphertextFFI *tmp_many_luts_output; CudaStreams active_streams; - std::vector sub_streams_vec; + CudaStreams *sub_streams; cudaEvent_t incoming_event; - std::vector> outgoing_events; + cudaEvent_t *outgoing_events; uint32_t num_streams; - std::vector tmp_block_comparisons_vec; - std::vector *> reduction_buffers; + CudaRadixCiphertextFFI **tmp_block_comparisons; + int_comparison_buffer **reduction_buffers; int_equality_selectors_buffer(CudaStreams streams, int_radix_params params, 
uint32_t num_possible_values, @@ -42,17 +42,18 @@ template struct int_equality_selectors_buffer { this->num_streams = num_streams_to_use; this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); incoming_event = cuda_create_event(streams.gpu_index(0)); - sub_streams_vec.resize(num_streams_to_use); - outgoing_events.resize(num_streams_to_use); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus]; for (uint32_t i = 0; i < num_streams_to_use; i++) { - sub_streams_vec[i].create_on_same_gpus(active_streams); - outgoing_events[i].resize(active_streams.count()); - for (uint32_t j = 0; j < active_streams.count(); j++) { - outgoing_events[i][j] = cuda_create_event(active_streams.gpu_index(j)); + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + outgoing_events[i * num_gpus + j] = + cuda_create_event(active_streams.gpu_index(j)); } } @@ -88,17 +89,19 @@ template struct int_equality_selectors_buffer { params.message_modulus * num_blocks, params.big_lwe_dimension, size_tracker, allocate_gpu_memory); - this->tmp_block_comparisons_vec.resize(this->num_streams); - this->reduction_buffers.resize(this->num_streams); + this->tmp_block_comparisons = + new CudaRadixCiphertextFFI *[this->num_streams]; + this->reduction_buffers = + new int_comparison_buffer *[this->num_streams]; for (uint32_t j = 0; j < this->num_streams; j++) { - this->tmp_block_comparisons_vec[j] = new CudaRadixCiphertextFFI; + this->tmp_block_comparisons[j] = new CudaRadixCiphertextFFI; create_zero_radix_ciphertext_async( streams.stream(0), streams.gpu_index(0), - this->tmp_block_comparisons_vec[j], num_blocks, - params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + this->tmp_block_comparisons[j], num_blocks, params.big_lwe_dimension, + size_tracker, allocate_gpu_memory); this->reduction_buffers[j] = new int_comparison_buffer( - 
sub_streams_vec[j], COMPARISON_TYPE::EQ, params, num_blocks, false, + sub_streams[j], COMPARISON_TYPE::EQ, params, num_blocks, false, allocate_gpu_memory, size_tracker); } } @@ -112,33 +115,37 @@ template struct int_equality_selectors_buffer { this->allocate_gpu_memory); delete this->tmp_many_luts_output; - for (auto ct : this->tmp_block_comparisons_vec) { + for (uint32_t i = 0; i < this->num_streams; i++) { release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), - ct, this->allocate_gpu_memory); - delete ct; + this->tmp_block_comparisons[i], + this->allocate_gpu_memory); + delete this->tmp_block_comparisons[i]; } - this->tmp_block_comparisons_vec.clear(); + delete[] this->tmp_block_comparisons; - for (auto buffer : this->reduction_buffers) { - buffer->release(streams); - delete buffer; + for (uint32_t i = 0; i < this->num_streams; i++) { + this->reduction_buffers[i]->release(streams); + delete this->reduction_buffers[i]; } - this->reduction_buffers.clear(); + delete[] this->reduction_buffers; cuda_event_destroy(incoming_event, streams.gpu_index(0)); - for (uint j = 0; j < num_streams; j++) { - for (uint k = 0; k < active_streams.count(); k++) { - cuda_event_destroy(outgoing_events[j][k], active_streams.gpu_index(k)); + + uint32_t num_gpus = active_streams.count(); + for (uint32_t i = 0; i < num_streams; i++) { + for (uint32_t j = 0; j < num_gpus; j++) { + cuda_event_destroy(outgoing_events[i * num_gpus + j], + active_streams.gpu_index(j)); } } - outgoing_events.clear(); + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - - for (auto &stream : sub_streams_vec) { - stream.release(); - } - sub_streams_vec.clear(); } }; @@ -151,15 +158,15 @@ template struct int_possible_results_buffer { uint32_t num_lut_accumulators; uint32_t lut_stride; - std::vector *> stream_luts_vec; + int_radix_lut **stream_luts; 
CudaStreams active_streams; - std::vector sub_streams_vec; + CudaStreams *sub_streams; cudaEvent_t incoming_event; - std::vector> outgoing_events; + cudaEvent_t *outgoing_events; uint32_t num_streams; - std::vector tmp_many_luts_output_vec; + CudaRadixCiphertextFFI **tmp_many_luts_output; int_possible_results_buffer(CudaStreams streams, int_radix_params params, uint32_t num_blocks, uint32_t num_possible_values, @@ -176,17 +183,18 @@ template struct int_possible_results_buffer { this->num_streams = num_streams_to_use; this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); incoming_event = cuda_create_event(streams.gpu_index(0)); - sub_streams_vec.resize(num_streams_to_use); - outgoing_events.resize(num_streams_to_use); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus]; for (uint32_t i = 0; i < num_streams_to_use; i++) { - sub_streams_vec[i].create_on_same_gpus(active_streams); - outgoing_events[i].resize(active_streams.count()); - for (uint32_t j = 0; j < active_streams.count(); j++) { - outgoing_events[i][j] = cuda_create_event(active_streams.gpu_index(j)); + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + outgoing_events[i * num_gpus + j] = + cuda_create_event(active_streams.gpu_index(j)); } } @@ -207,11 +215,13 @@ template struct int_possible_results_buffer { this->num_lut_accumulators = (total_luts_needed + max_luts_per_call - 1) / max_luts_per_call; - stream_luts_vec.reserve(num_streams * num_lut_accumulators); + stream_luts = + new int_radix_lut *[num_streams * num_lut_accumulators]; std::vector> fns; fns.reserve(max_luts_per_call); + uint32_t lut_count = 0; for (uint32_t s = 0; s < num_streams; s++) { uint32_t lut_value_start = 0; @@ -221,7 +231,7 @@ template struct int_possible_results_buffer { std::min(max_luts_per_call, total_luts_needed - lut_value_start); int_radix_lut *current_lut = 
new int_radix_lut( - sub_streams_vec[s], params, 1, 1, luts_in_this_call, + sub_streams[s], params, 1, 1, luts_in_this_call, allocate_gpu_memory, size_tracker); for (uint32_t j = 0; j < luts_in_this_call; j++) { @@ -236,51 +246,56 @@ template struct int_possible_results_buffer { params.message_modulus, params.carry_modulus, fns, allocate_gpu_memory); - current_lut->broadcast_lut(sub_streams_vec[s].active_gpu_subset(1)); - stream_luts_vec.push_back(current_lut); + current_lut->broadcast_lut(sub_streams[s].active_gpu_subset(1)); + stream_luts[lut_count++] = current_lut; lut_value_start += luts_in_this_call; } } fns.clear(); - this->tmp_many_luts_output_vec.resize(this->num_streams); + this->tmp_many_luts_output = + new CudaRadixCiphertextFFI *[this->num_streams]; for (uint32_t j = 0; j < this->num_streams; j++) { - this->tmp_many_luts_output_vec[j] = new CudaRadixCiphertextFFI; + this->tmp_many_luts_output[j] = new CudaRadixCiphertextFFI; create_zero_radix_ciphertext_async( streams.stream(0), streams.gpu_index(0), - this->tmp_many_luts_output_vec[j], max_luts_per_call, + this->tmp_many_luts_output[j], max_luts_per_call, params.big_lwe_dimension, size_tracker, allocate_gpu_memory); } } void release(CudaStreams streams) { - for (auto lut : stream_luts_vec) { - lut->release(streams); - delete lut; + for (uint32_t i = 0; i < num_streams * num_lut_accumulators; i++) { + stream_luts[i]->release(streams); + delete stream_luts[i]; } - stream_luts_vec.clear(); + delete[] stream_luts; - for (auto ct : this->tmp_many_luts_output_vec) { + for (uint32_t i = 0; i < this->num_streams; i++) { release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), - ct, this->allocate_gpu_memory); - delete ct; + this->tmp_many_luts_output[i], + this->allocate_gpu_memory); + delete this->tmp_many_luts_output[i]; } - this->tmp_many_luts_output_vec.clear(); + delete[] this->tmp_many_luts_output; cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = 
active_streams.count(); for (uint j = 0; j < num_streams; j++) { - for (uint k = 0; k < active_streams.count(); k++) { - cuda_event_destroy(outgoing_events[j][k], active_streams.gpu_index(k)); + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); } } - outgoing_events.clear(); + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - - for (auto &stream : sub_streams_vec) { - stream.release(); - } - sub_streams_vec.clear(); } }; @@ -289,23 +304,23 @@ template struct int_aggregate_one_hot_buffer { bool allocate_gpu_memory; uint32_t chunk_size; - std::vector *> stream_identity_luts; + int_radix_lut **stream_identity_luts; int_radix_lut *message_extract_lut; int_radix_lut *carry_extract_lut; CudaStreams active_streams; - std::vector sub_streams_vec; + CudaStreams *sub_streams; cudaEvent_t incoming_event; - std::vector> outgoing_events; + cudaEvent_t *outgoing_events; cudaEvent_t reduction_done_event; - std::vector message_done_events; - std::vector carry_done_events; + cudaEvent_t *message_done_events; + cudaEvent_t *carry_done_events; uint32_t num_streams; - std::vector partial_aggregated_vectors; - std::vector partial_temp_vectors; + CudaRadixCiphertextFFI **partial_aggregated_vectors; + CudaRadixCiphertextFFI **partial_temp_vectors; CudaRadixCiphertextFFI *message_ct; CudaRadixCiphertextFFI *carry_ct; @@ -327,37 +342,37 @@ template struct int_aggregate_one_hot_buffer { this->num_streams = num_streams_to_use; this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); this->incoming_event = cuda_create_event(streams.gpu_index(0)); this->reduction_done_event = cuda_create_event(streams.gpu_index(0)); - this->message_done_events.resize(active_streams.count()); - 
this->carry_done_events.resize(active_streams.count()); - for (uint32_t i = 0; i < active_streams.count(); i++) { + this->message_done_events = new cudaEvent_t[num_gpus]; + this->carry_done_events = new cudaEvent_t[num_gpus]; + for (uint32_t i = 0; i < num_gpus; i++) { this->message_done_events[i] = cuda_create_event(active_streams.gpu_index(i)); this->carry_done_events[i] = cuda_create_event(active_streams.gpu_index(i)); } - this->sub_streams_vec.resize(num_streams); - this->outgoing_events.resize(num_streams); + this->sub_streams = new CudaStreams[num_streams]; + this->outgoing_events = new cudaEvent_t[num_streams * num_gpus]; for (uint32_t i = 0; i < num_streams; i++) { - this->sub_streams_vec[i].create_on_same_gpus(active_streams); - this->outgoing_events[i].resize(active_streams.count()); - for (uint32_t j = 0; j < active_streams.count(); j++) { - this->outgoing_events[i][j] = + this->sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + this->outgoing_events[i * num_gpus + j] = cuda_create_event(active_streams.gpu_index(j)); } } - this->stream_identity_luts.reserve(num_streams); + this->stream_identity_luts = new int_radix_lut *[num_streams]; std::function id_fn = [](Torus x) -> Torus { return x; }; for (uint32_t i = 0; i < num_streams; i++) { int_radix_lut *lut = - new int_radix_lut(sub_streams_vec[i], params, 1, num_blocks, + new int_radix_lut(sub_streams[i], params, 1, num_blocks, allocate_gpu_memory, size_tracker); generate_device_accumulator( @@ -366,8 +381,8 @@ template struct int_aggregate_one_hot_buffer { params.polynomial_size, params.message_modulus, params.carry_modulus, id_fn, allocate_gpu_memory); - lut->broadcast_lut(sub_streams_vec[i].active_gpu_subset(num_blocks)); - this->stream_identity_luts.push_back(lut); + lut->broadcast_lut(sub_streams[i].active_gpu_subset(num_blocks)); + this->stream_identity_luts[i] = lut; } std::function msg_fn = [params](Torus x) -> Torus { @@ -378,7 +393,7 @@ template 
struct int_aggregate_one_hot_buffer { }; this->message_extract_lut = - new int_radix_lut(sub_streams_vec[0], params, 1, num_blocks, + new int_radix_lut(sub_streams[0], params, 1, num_blocks, allocate_gpu_memory, size_tracker); generate_device_accumulator( streams.stream(0), streams.gpu_index(0), @@ -388,10 +403,10 @@ template struct int_aggregate_one_hot_buffer { params.polynomial_size, params.message_modulus, params.carry_modulus, msg_fn, allocate_gpu_memory); this->message_extract_lut->broadcast_lut( - sub_streams_vec[0].active_gpu_subset(num_blocks)); + sub_streams[0].active_gpu_subset(num_blocks)); this->carry_extract_lut = - new int_radix_lut(sub_streams_vec[1], params, 1, num_blocks, + new int_radix_lut(sub_streams[1], params, 1, num_blocks, allocate_gpu_memory, size_tracker); generate_device_accumulator( streams.stream(0), streams.gpu_index(0), @@ -401,10 +416,11 @@ template struct int_aggregate_one_hot_buffer { params.polynomial_size, params.message_modulus, params.carry_modulus, carry_fn, allocate_gpu_memory); this->carry_extract_lut->broadcast_lut( - sub_streams_vec[1].active_gpu_subset(num_blocks)); + sub_streams[1].active_gpu_subset(num_blocks)); - this->partial_aggregated_vectors.resize(num_streams); - this->partial_temp_vectors.resize(num_streams); + this->partial_aggregated_vectors = + new CudaRadixCiphertextFFI *[num_streams]; + this->partial_temp_vectors = new CudaRadixCiphertextFFI *[num_streams]; for (uint32_t i = 0; i < num_streams; i++) { this->partial_aggregated_vectors[i] = new CudaRadixCiphertextFFI; @@ -433,11 +449,11 @@ template struct int_aggregate_one_hot_buffer { } void release(CudaStreams streams) { - for (auto lut : stream_identity_luts) { - lut->release(streams); - delete lut; + for (uint32_t i = 0; i < num_streams; i++) { + stream_identity_luts[i]->release(streams); + delete stream_identity_luts[i]; } - stream_identity_luts.clear(); + delete[] stream_identity_luts; this->message_extract_lut->release(streams); delete 
this->message_extract_lut; @@ -446,17 +462,17 @@ template struct int_aggregate_one_hot_buffer { for (uint32_t i = 0; i < num_streams; i++) { release_radix_ciphertext_async( - sub_streams_vec[i].stream(0), sub_streams_vec[i].gpu_index(0), + sub_streams[i].stream(0), sub_streams[i].gpu_index(0), this->partial_aggregated_vectors[i], this->allocate_gpu_memory); delete this->partial_aggregated_vectors[i]; release_radix_ciphertext_async( - sub_streams_vec[i].stream(0), sub_streams_vec[i].gpu_index(0), + sub_streams[i].stream(0), sub_streams[i].gpu_index(0), this->partial_temp_vectors[i], this->allocate_gpu_memory); delete this->partial_temp_vectors[i]; } - partial_aggregated_vectors.clear(); - partial_temp_vectors.clear(); + delete[] partial_aggregated_vectors; + delete[] partial_temp_vectors; release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), this->message_ct, this->allocate_gpu_memory); @@ -467,24 +483,28 @@ template struct int_aggregate_one_hot_buffer { cuda_event_destroy(incoming_event, streams.gpu_index(0)); cuda_event_destroy(reduction_done_event, streams.gpu_index(0)); - for (uint i = 0; i < active_streams.count(); i++) { + uint32_t num_gpus = active_streams.count(); + for (uint i = 0; i < num_gpus; i++) { cuda_event_destroy(message_done_events[i], active_streams.gpu_index(i)); cuda_event_destroy(carry_done_events[i], active_streams.gpu_index(i)); } + delete[] message_done_events; + delete[] carry_done_events; for (uint j = 0; j < num_streams; j++) { - for (uint k = 0; k < active_streams.count(); k++) { - cuda_event_destroy(outgoing_events[j][k], active_streams.gpu_index(k)); + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); } } - outgoing_events.clear(); + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); - - for (auto 
&stream : sub_streams_vec) { - stream.release(); - } - sub_streams_vec.clear(); } }; @@ -683,3 +703,1028 @@ template struct int_unchecked_match_value_or_buffer { cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } }; + +template struct int_unchecked_contains_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_comparison_buffer **eq_buffers; + int_comparison_buffer *reduction_buffer; + + CudaRadixCiphertextFFI *packed_selectors; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_contains_buffer(CudaStreams streams, int_radix_params params, + uint32_t num_inputs, uint32_t num_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + outgoing_events[i * num_gpus + j] = + cuda_create_event(active_streams.gpu_index(j)); + } + } + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->reduction_buffer = + new int_comparison_buffer(streams, EQ, params, 
num_inputs, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + this->reduction_buffer->release(streams); + delete this->reduction_buffer; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_unchecked_contains_clear_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_comparison_buffer **eq_buffers; + int_comparison_buffer *reduction_buffer; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *tmp_clear_val; + Torus *d_clear_val; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_contains_clear_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_inputs, uint32_t num_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + 
uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + uint32_t num_gpus = active_streams.count(); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < num_gpus; j++) { + outgoing_events[i * num_gpus + j] = + cuda_create_event(active_streams.gpu_index(j)); + } + } + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->reduction_buffer = + new int_comparison_buffer(streams, EQ, params, num_inputs, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->tmp_clear_val = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->tmp_clear_val, + num_blocks, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->d_clear_val = (Torus *)cuda_malloc_with_size_tracking_async( + num_blocks * sizeof(Torus), streams.stream(0), streams.gpu_index(0), + size_tracker, allocate_gpu_memory); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + 
this->reduction_buffer->release(streams); + delete this->reduction_buffer; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->tmp_clear_val, + this->allocate_gpu_memory); + delete this->tmp_clear_val; + + cuda_drop_async(this->d_clear_val, streams.stream(0), streams.gpu_index(0)); + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_unchecked_is_in_clears_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_clears; + + int_equality_selectors_buffer *eq_buffer; + int_comparison_buffer *reduction_buffer; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + + int_unchecked_is_in_clears_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_clears, uint32_t num_blocks, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_clears = num_clears; + + this->eq_buffer = new int_equality_selectors_buffer( + streams, params, num_clears, num_blocks, allocate_gpu_memory, + size_tracker); + + this->reduction_buffer = + new int_comparison_buffer(streams, EQ, params, num_clears, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), 
this->packed_selectors, + num_clears, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_clears]; + + for (uint32_t i = 0; i < num_clears; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + } + + void release(CudaStreams streams) { + this->eq_buffer->release(streams); + delete this->eq_buffer; + + this->reduction_buffer->release(streams); + delete this->reduction_buffer; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + delete[] this->unpacked_selectors; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_final_index_from_selectors_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_possible_results_buffer *possible_results_buf; + int_aggregate_one_hot_buffer *aggregate_buf; + int_comparison_buffer *reduction_buf; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + CudaRadixCiphertextFFI *possible_results_ct_list; + + uint64_t *h_indices; + + int_final_index_from_selectors_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_inputs, + uint32_t num_blocks_index, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + this->possible_results_buf = new int_possible_results_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->aggregate_buf = new int_aggregate_one_hot_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->reduction_buf = + new int_comparison_buffer(streams, EQ, params, num_inputs, false, + 
allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + + this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], packed_len, + params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + } + + uint32_t num_bits_in_message = log2_int(params.message_modulus); + uint32_t bits_per_packed_block = 2 * num_bits_in_message; + + h_indices = new uint64_t[num_inputs * packed_len]; + for (uint32_t i = 0; i < num_inputs; i++) { + uint64_t val = i; + for (uint32_t b = 0; b < packed_len; b++) { + uint64_t mask = (1ULL << bits_per_packed_block) - 1; + uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask; + h_indices[i * packed_len + b] = block_val; + } + } + } + + void release(CudaStreams streams) { + this->possible_results_buf->release(streams); + delete this->possible_results_buf; + + this->aggregate_buf->release(streams); + delete this->aggregate_buf; + + this->reduction_buf->release(streams); + delete this->reduction_buf; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + delete[] this->unpacked_selectors; + + for (uint32_t i = 0; i < num_inputs; i++) { + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], + this->allocate_gpu_memory); + } + delete[] 
this->possible_results_ct_list; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + + delete[] h_indices; + } +}; + +template struct int_unchecked_index_in_clears_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_clears; + + int_equality_selectors_buffer *eq_selectors_buf; + int_final_index_from_selectors_buffer *final_index_buf; + + int_unchecked_index_in_clears_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_clears = num_clears; + + this->eq_selectors_buf = new int_equality_selectors_buffer( + streams, params, num_clears, num_blocks, allocate_gpu_memory, + size_tracker); + + this->final_index_buf = new int_final_index_from_selectors_buffer( + streams, params, num_clears, num_blocks_index, allocate_gpu_memory, + size_tracker); + } + + void release(CudaStreams streams) { + this->eq_selectors_buf->release(streams); + delete this->eq_selectors_buf; + + this->final_index_buf->release(streams); + delete this->final_index_buf; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_unchecked_first_index_in_clears_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_unique; + + int_equality_selectors_buffer *eq_selectors_buf; + int_possible_results_buffer *possible_results_buf; + int_aggregate_one_hot_buffer *aggregate_buf; + int_comparison_buffer *reduction_buf; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + CudaRadixCiphertextFFI *possible_results_ct_list; + + int_unchecked_first_index_in_clears_buffer( + CudaStreams streams, int_radix_params params, uint32_t num_unique, + uint32_t num_blocks, uint32_t num_blocks_index, bool allocate_gpu_memory, + uint64_t &size_tracker) { + 
this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_unique = num_unique; + + this->eq_selectors_buf = new int_equality_selectors_buffer( + streams, params, num_unique, num_blocks, allocate_gpu_memory, + size_tracker); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + this->possible_results_buf = new int_possible_results_buffer( + streams, params, packed_len, num_unique, allocate_gpu_memory, + size_tracker); + + this->aggregate_buf = new int_aggregate_one_hot_buffer( + streams, params, packed_len, num_unique, allocate_gpu_memory, + size_tracker); + + this->reduction_buf = + new int_comparison_buffer(streams, EQ, params, num_unique, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_unique, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_unique]; + for (uint32_t i = 0; i < num_unique; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + + this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_unique]; + for (uint32_t i = 0; i < num_unique; i++) { + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], packed_len, + params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + } + } + + void release(CudaStreams streams) { + this->eq_selectors_buf->release(streams); + delete this->eq_selectors_buf; + + this->possible_results_buf->release(streams); + delete this->possible_results_buf; + + this->aggregate_buf->release(streams); + delete this->aggregate_buf; + + this->reduction_buf->release(streams); + delete this->reduction_buf; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete 
this->packed_selectors; + + delete[] this->unpacked_selectors; + + for (uint32_t i = 0; i < num_unique; i++) { + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], + this->allocate_gpu_memory); + } + delete[] this->possible_results_ct_list; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; + +template struct int_unchecked_first_index_of_clear_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_comparison_buffer **eq_buffers; + int_possible_results_buffer *possible_results_buf; + int_aggregate_one_hot_buffer *aggregate_buf; + int_comparison_buffer *reduction_buf; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + CudaRadixCiphertextFFI *possible_results_ct_list; + CudaRadixCiphertextFFI *tmp_clear_val; + Torus *d_clear_val; + uint64_t *h_indices; + + int_radix_lut *prefix_sum_lut; + int_radix_lut *cleanup_lut; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_first_index_of_clear_buffer( + CudaStreams streams, int_radix_params params, uint32_t num_inputs, + uint32_t num_blocks, uint32_t num_blocks_index, bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = + new cudaEvent_t[num_streams_to_use * active_streams.count()]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + 
sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < active_streams.count(); j++) { + outgoing_events[i * active_streams.count() + j] = + cuda_create_event(active_streams.gpu_index(j)); + } + } + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->possible_results_buf = new int_possible_results_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->aggregate_buf = new int_aggregate_one_hot_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->reduction_buf = + new int_comparison_buffer(streams, EQ, params, num_inputs, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + + this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], packed_len, + params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + } + + this->tmp_clear_val = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->tmp_clear_val, + num_blocks, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->d_clear_val = (Torus 
*)cuda_malloc_with_size_tracking_async( + num_blocks * sizeof(Torus), streams.stream(0), streams.gpu_index(0), + size_tracker, allocate_gpu_memory); + + h_indices = nullptr; + if (allocate_gpu_memory) { + uint32_t num_bits_in_message = log2_int(params.message_modulus); + uint32_t bits_per_packed_block = 2 * num_bits_in_message; + + h_indices = new uint64_t[num_inputs * packed_len]; + for (uint32_t i = 0; i < num_inputs; i++) { + uint64_t val = i; + for (uint32_t b = 0; b < packed_len; b++) { + uint64_t mask = (1ULL << bits_per_packed_block) - 1; + uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask; + h_indices[i * packed_len + b] = block_val; + } + } + } + + const Torus ALREADY_SEEN = 2; + auto prefix_sum_fn = [ALREADY_SEEN](Torus current, + Torus previous) -> Torus { + if (previous == 1 || previous == ALREADY_SEEN) { + return ALREADY_SEEN; + } + return current; + }; + this->prefix_sum_lut = new int_radix_lut( + streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker); + + generate_device_accumulator_bivariate( + streams.stream(0), streams.gpu_index(0), + this->prefix_sum_lut->get_lut(0, 0), + this->prefix_sum_lut->get_degree(0), + this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + prefix_sum_fn, allocate_gpu_memory); + this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs)); + + auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus { + Torus val = x % params.message_modulus; + if (val == ALREADY_SEEN) + return 0; + return val; + }; + this->cleanup_lut = new int_radix_lut( + streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker); + generate_device_accumulator( + streams.stream(0), streams.gpu_index(0), + this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0), + this->cleanup_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + cleanup_fn, 
allocate_gpu_memory); + this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs)); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + this->possible_results_buf->release(streams); + delete this->possible_results_buf; + + this->aggregate_buf->release(streams); + delete this->aggregate_buf; + + this->reduction_buf->release(streams); + delete this->reduction_buf; + + this->prefix_sum_lut->release(streams); + delete this->prefix_sum_lut; + + this->cleanup_lut->release(streams); + delete this->cleanup_lut; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + delete[] this->unpacked_selectors; + + for (uint32_t i = 0; i < num_inputs; i++) { + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], + this->allocate_gpu_memory); + } + delete[] this->possible_results_ct_list; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->tmp_clear_val, + this->allocate_gpu_memory); + delete this->tmp_clear_val; + + cuda_drop_async(this->d_clear_val, streams.stream(0), streams.gpu_index(0)); + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + + delete[] h_indices; + } +}; + +template struct int_unchecked_first_index_of_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + 
int_comparison_buffer **eq_buffers; + int_possible_results_buffer *possible_results_buf; + int_aggregate_one_hot_buffer *aggregate_buf; + int_comparison_buffer *reduction_buf; + + CudaRadixCiphertextFFI *packed_selectors; + CudaRadixCiphertextFFI *unpacked_selectors; + CudaRadixCiphertextFFI *possible_results_ct_list; + uint64_t *h_indices; + + int_radix_lut *prefix_sum_lut; + int_radix_lut *cleanup_lut; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_first_index_of_buffer(CudaStreams streams, + int_radix_params params, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = + new cudaEvent_t[num_streams_to_use * active_streams.count()]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < active_streams.count(); j++) { + outgoing_events[i * active_streams.count() + j] = + cuda_create_event(active_streams.gpu_index(j)); + } + } + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->possible_results_buf = new int_possible_results_buffer( + 
streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->aggregate_buf = new int_aggregate_one_hot_buffer( + streams, params, packed_len, num_inputs, allocate_gpu_memory, + size_tracker); + + this->reduction_buf = + new int_comparison_buffer(streams, EQ, params, num_inputs, false, + allocate_gpu_memory, size_tracker); + + this->packed_selectors = new CudaRadixCiphertextFFI; + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), this->packed_selectors, + num_inputs, params.big_lwe_dimension, size_tracker, + allocate_gpu_memory); + + this->unpacked_selectors = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + as_radix_ciphertext_slice(&this->unpacked_selectors[i], + this->packed_selectors, i, i + 1); + } + + this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_inputs]; + for (uint32_t i = 0; i < num_inputs; i++) { + create_zero_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], packed_len, + params.big_lwe_dimension, size_tracker, allocate_gpu_memory); + } + + h_indices = nullptr; + if (allocate_gpu_memory) { + uint32_t num_bits_in_message = log2_int(params.message_modulus); + uint32_t bits_per_packed_block = 2 * num_bits_in_message; + + h_indices = new uint64_t[num_inputs * packed_len]; + for (uint32_t i = 0; i < num_inputs; i++) { + uint64_t val = i; + for (uint32_t b = 0; b < packed_len; b++) { + uint64_t mask = (1ULL << bits_per_packed_block) - 1; + uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask; + h_indices[i * packed_len + b] = block_val; + } + } + } + + const Torus ALREADY_SEEN = 2; + auto prefix_sum_fn = [ALREADY_SEEN](Torus current, + Torus previous) -> Torus { + if (previous == 1 || previous == ALREADY_SEEN) { + return ALREADY_SEEN; + } + return current; + }; + this->prefix_sum_lut = new int_radix_lut( + streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker); + + 
generate_device_accumulator_bivariate( + streams.stream(0), streams.gpu_index(0), + this->prefix_sum_lut->get_lut(0, 0), + this->prefix_sum_lut->get_degree(0), + this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + prefix_sum_fn, allocate_gpu_memory); + this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs)); + + auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus { + Torus val = x % params.message_modulus; + if (val == ALREADY_SEEN) + return 0; + return val; + }; + this->cleanup_lut = new int_radix_lut( + streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker); + generate_device_accumulator( + streams.stream(0), streams.gpu_index(0), + this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0), + this->cleanup_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + cleanup_fn, allocate_gpu_memory); + this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs)); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + this->possible_results_buf->release(streams); + delete this->possible_results_buf; + + this->aggregate_buf->release(streams); + delete this->aggregate_buf; + + this->reduction_buf->release(streams); + delete this->reduction_buf; + + this->prefix_sum_lut->release(streams); + delete this->prefix_sum_lut; + + this->cleanup_lut->release(streams); + delete this->cleanup_lut; + + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + this->packed_selectors, + this->allocate_gpu_memory); + delete this->packed_selectors; + + delete[] this->unpacked_selectors; + + for (uint32_t i = 0; i < num_inputs; i++) { + release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0), + &this->possible_results_ct_list[i], 
+ this->allocate_gpu_memory); + } + delete[] this->possible_results_ct_list; + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + + delete[] h_indices; + } +}; + +template struct int_unchecked_index_of_buffer { + int_radix_params params; + bool allocate_gpu_memory; + uint32_t num_inputs; + + int_comparison_buffer **eq_buffers; + int_final_index_from_selectors_buffer *final_index_buf; + + CudaStreams active_streams; + CudaStreams *sub_streams; + cudaEvent_t incoming_event; + cudaEvent_t *outgoing_events; + uint32_t num_streams; + + int_unchecked_index_of_buffer(CudaStreams streams, int_radix_params params, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, + bool allocate_gpu_memory, + uint64_t &size_tracker) { + this->params = params; + this->allocate_gpu_memory = allocate_gpu_memory; + this->num_inputs = num_inputs; + + uint32_t num_streams_to_use = + std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs); + if (num_streams_to_use == 0) + num_streams_to_use = 1; + + this->num_streams = num_streams_to_use; + this->active_streams = streams.active_gpu_subset(num_blocks); + + incoming_event = cuda_create_event(streams.gpu_index(0)); + sub_streams = new CudaStreams[num_streams_to_use]; + outgoing_events = + new cudaEvent_t[num_streams_to_use * active_streams.count()]; + + for (uint32_t i = 0; i < num_streams_to_use; i++) { + sub_streams[i].create_on_same_gpus(active_streams); + for (uint32_t j = 0; j < active_streams.count(); j++) { + outgoing_events[i * active_streams.count() + j] = + 
cuda_create_event(active_streams.gpu_index(j)); + } + } + + this->eq_buffers = new int_comparison_buffer *[num_streams]; + for (uint32_t i = 0; i < num_streams; i++) { + this->eq_buffers[i] = new int_comparison_buffer( + sub_streams[i], EQ, params, num_blocks, false, allocate_gpu_memory, + size_tracker); + } + + this->final_index_buf = new int_final_index_from_selectors_buffer( + streams, params, num_inputs, num_blocks_index, allocate_gpu_memory, + size_tracker); + } + + void release(CudaStreams streams) { + for (uint32_t i = 0; i < num_streams; i++) { + eq_buffers[i]->release(streams); + delete eq_buffers[i]; + } + delete[] eq_buffers; + + this->final_index_buf->release(streams); + delete this->final_index_buf; + + cuda_event_destroy(incoming_event, streams.gpu_index(0)); + + uint32_t num_gpus = active_streams.count(); + for (uint j = 0; j < num_streams; j++) { + for (uint k = 0; k < num_gpus; k++) { + cuda_event_destroy(outgoing_events[j * num_gpus + k], + active_streams.gpu_index(k)); + } + } + delete[] outgoing_events; + + for (uint32_t i = 0; i < num_streams; i++) { + sub_streams[i].release(); + } + delete[] sub_streams; + + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + } +}; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu index 250311791..3508cabdb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu @@ -284,46 +284,6 @@ void cleanup_cuda_apply_bivariate_lut_64(CudaStreamsFFI streams, POP_RANGE() } -uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level, - uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks, - uint32_t message_modulus, uint32_t 
carry_modulus, PBS_TYPE pbs_type, - uint64_t lut_degree, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type) { - - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - glwe_dimension * polynomial_size, lwe_dimension, - ks_level, ks_base_log, pbs_level, pbs_base_log, - grouping_factor, message_modulus, carry_modulus, - noise_reduction_type); - - return scratch_cuda_apply_bivariate_lut( - CudaStreams(streams), (int_radix_lut **)mem_ptr, - static_cast(input_lut), num_radix_blocks, params, - lut_degree, allocate_gpu_memory); -} - -void cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe, - CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr, - void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) { - - host_compute_prefix_sum_hillis_steele( - CudaStreams(streams), output_radix_lwe, generates_or_propagates, - (int_radix_lut *)mem_ptr, bsks, (uint64_t **)(ksks), - num_radix_blocks); -} - -void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - CudaStreamsFFI streams, int8_t **mem_ptr_void) { - int_radix_lut *mem_ptr = (int_radix_lut *)(*mem_ptr_void); - mem_ptr->release(CudaStreams(streams)); - delete mem_ptr; - *mem_ptr_void = nullptr; -} - void cuda_integer_reverse_blocks_64_inplace(CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array) { diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu index f953e5a74..9d74f8981 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu @@ -1,133 +1,5 @@ #include "integer/vector_find.cuh" -uint64_t scratch_cuda_compute_equality_selectors_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - 
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type) { - - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus, noise_reduction_type); - - return scratch_cuda_compute_equality_selectors( - CudaStreams(streams), (int_equality_selectors_buffer **)mem_ptr, - params, num_possible_values, num_blocks, allocate_gpu_memory); -} - -void cuda_compute_equality_selectors_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list, - CudaRadixCiphertextFFI const *lwe_array_in, uint32_t num_blocks, - const uint64_t *h_decomposed_cleartexts, int8_t *mem, void *const *bsks, - void *const *ksks) { - - host_compute_equality_selectors( - CudaStreams(streams), lwe_array_out_list, lwe_array_in, num_blocks, - h_decomposed_cleartexts, (int_equality_selectors_buffer *)mem, - bsks, (uint64_t *const *)ksks); -} - -void cleanup_cuda_compute_equality_selectors_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void) { - int_equality_selectors_buffer *mem_ptr = - (int_equality_selectors_buffer *)(*mem_ptr_void); - - mem_ptr->release(CudaStreams(streams)); - - delete mem_ptr; - *mem_ptr_void = nullptr; -} - -uint64_t scratch_cuda_create_possible_results_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_possible_values, uint32_t num_blocks, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type) { - - 
int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus, noise_reduction_type); - - return scratch_cuda_create_possible_results( - CudaStreams(streams), (int_possible_results_buffer **)mem_ptr, - params, num_blocks, num_possible_values, allocate_gpu_memory); -} - -void cuda_create_possible_results_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_list, - CudaRadixCiphertextFFI const *lwe_array_in_list, - uint32_t num_possible_values, const uint64_t *h_decomposed_cleartexts, - uint32_t num_blocks, int8_t *mem, void *const *bsks, void *const *ksks) { - - host_create_possible_results( - CudaStreams(streams), lwe_array_out_list, lwe_array_in_list, - num_possible_values, h_decomposed_cleartexts, num_blocks, - (int_possible_results_buffer *)mem, bsks, - (uint64_t *const *)ksks); -} - -void cleanup_cuda_create_possible_results_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void) { - int_possible_results_buffer *mem_ptr = - (int_possible_results_buffer *)(*mem_ptr_void); - - mem_ptr->release(CudaStreams(streams)); - - delete mem_ptr; - *mem_ptr_void = nullptr; -} - -uint64_t scratch_cuda_aggregate_one_hot_vector_64( - CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t big_lwe_dimension, - uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, - uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, - uint32_t num_blocks, uint32_t num_matches, uint32_t message_modulus, - uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, - PBS_MS_REDUCTION_T noise_reduction_type) { - - int_radix_params params(pbs_type, glwe_dimension, polynomial_size, - big_lwe_dimension, small_lwe_dimension, ks_level, - ks_base_log, pbs_level, pbs_base_log, grouping_factor, - message_modulus, carry_modulus, noise_reduction_type); - - return 
scratch_cuda_aggregate_one_hot_vector( - CudaStreams(streams), (int_aggregate_one_hot_buffer **)mem_ptr, - params, num_blocks, num_matches, allocate_gpu_memory); -} - -void cuda_aggregate_one_hot_vector_64( - CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out, - CudaRadixCiphertextFFI const *lwe_array_in_list, - uint32_t num_input_ciphertexts, uint32_t num_blocks, int8_t *mem, - void *const *bsks, void *const *ksks) { - - host_aggregate_one_hot_vector( - CudaStreams(streams), lwe_array_out, lwe_array_in_list, - num_input_ciphertexts, num_blocks, - (int_aggregate_one_hot_buffer *)mem, bsks, - (uint64_t *const *)ksks); -} - -void cleanup_cuda_aggregate_one_hot_vector_64(CudaStreamsFFI streams, - int8_t **mem_ptr_void) { - int_aggregate_one_hot_buffer *mem_ptr = - (int_aggregate_one_hot_buffer *)(*mem_ptr_void); - - mem_ptr->release(CudaStreams(streams)); - - delete mem_ptr; - *mem_ptr_void = nullptr; -} - uint64_t scratch_cuda_unchecked_match_value_64( CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t big_lwe_dimension, @@ -221,3 +93,410 @@ void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams, delete mem_ptr; *mem_ptr_void = nullptr; } + +uint64_t scratch_cuda_unchecked_contains_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return 
scratch_cuda_unchecked_contains( + CudaStreams(streams), (int_unchecked_contains_buffer **)mem_ptr, + params, num_inputs, num_blocks, allocate_gpu_memory); +} + +void cuda_unchecked_contains_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks) { + + host_unchecked_contains( + CudaStreams(streams), output, inputs, value, num_inputs, num_blocks, + (int_unchecked_contains_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_contains_buffer *mem_ptr = + (int_unchecked_contains_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_contains_clear_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_contains_clear( + CudaStreams(streams), + (int_unchecked_contains_clear_buffer **)mem_ptr, params, + num_inputs, num_blocks, allocate_gpu_memory); +} + +void cuda_unchecked_contains_clear_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, + 
uint32_t num_inputs, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks) { + + host_unchecked_contains_clear( + CudaStreams(streams), output, inputs, h_clear_val, num_inputs, num_blocks, + (int_unchecked_contains_clear_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_contains_clear_buffer *mem_ptr = + (int_unchecked_contains_clear_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_is_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_is_in_clears( + CudaStreams(streams), + (int_unchecked_is_in_clears_buffer **)mem_ptr, params, + num_clears, num_blocks, allocate_gpu_memory); +} + +void cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, + uint32_t num_clears, uint32_t num_blocks, + int8_t *mem, void *const *bsks, + void *const *ksks) { + + host_unchecked_is_in_clears( + CudaStreams(streams), output, input, h_cleartexts, num_clears, num_blocks, + (int_unchecked_is_in_clears_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void 
cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_is_in_clears_buffer *mem_ptr = + (int_unchecked_is_in_clears_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_compute_final_index_from_selectors_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks_index, uint32_t message_modulus, + uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory, + PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_compute_final_index_from_selectors( + CudaStreams(streams), + (int_final_index_from_selectors_buffer **)mem_ptr, params, + num_inputs, num_blocks_index, allocate_gpu_memory); +} + +void cuda_compute_final_index_from_selectors_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors, + uint32_t num_inputs, uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks) { + + host_compute_final_index_from_selectors( + CudaStreams(streams), index_ct, match_ct, selectors, num_inputs, + num_blocks_index, (int_final_index_from_selectors_buffer *)mem, + bsks, (uint64_t *const *)ksks); +} + +void cleanup_cuda_compute_final_index_from_selectors_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_final_index_from_selectors_buffer *mem_ptr = + (int_final_index_from_selectors_buffer *)(*mem_ptr_void); + + 
mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_index_in_clears_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_index_in_clears( + CudaStreams(streams), + (int_unchecked_index_in_clears_buffer **)mem_ptr, params, + num_clears, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, + uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks) { + + host_unchecked_index_in_clears( + CudaStreams(streams), index_ct, match_ct, input, h_cleartexts, num_clears, + num_blocks, num_blocks_index, + (int_unchecked_index_in_clears_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_index_in_clears_buffer *mem_ptr = + (int_unchecked_index_in_clears_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_first_index_in_clears_64( 
+ CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_first_index_in_clears( + CudaStreams(streams), + (int_unchecked_first_index_in_clears_buffer **)mem_ptr, params, + num_unique, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_first_index_in_clears_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input, + const uint64_t *h_unique_values, const uint64_t *h_unique_indices, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, + int8_t *mem, void *const *bsks, void *const *ksks) { + + host_unchecked_first_index_in_clears( + CudaStreams(streams), index_ct, match_ct, input, h_unique_values, + h_unique_indices, num_unique, num_blocks, num_blocks_index, + (int_unchecked_first_index_in_clears_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_first_index_in_clears_buffer *mem_ptr = + (int_unchecked_first_index_in_clears_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_first_index_of_clear_64( + CudaStreamsFFI streams, int8_t **mem_ptr, 
uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_first_index_of_clear( + CudaStreams(streams), + (int_unchecked_first_index_of_clear_buffer **)mem_ptr, params, + num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_first_index_of_clear_64( + CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, void *const *bsks, + void *const *ksks) { + + host_unchecked_first_index_of_clear( + CudaStreams(streams), index_ct, match_ct, inputs, h_clear_val, num_inputs, + num_blocks, num_blocks_index, + (int_unchecked_first_index_of_clear_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_first_index_of_clear_buffer *mem_ptr = + (int_unchecked_first_index_of_clear_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_first_index_of_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, 
uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_first_index_of( + CudaStreams(streams), + (int_unchecked_first_index_of_buffer **)mem_ptr, params, + num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks) { + + host_unchecked_first_index_of( + CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs, + num_blocks, num_blocks_index, + (int_unchecked_first_index_of_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_first_index_of_buffer *mem_ptr = + (int_unchecked_first_index_of_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} + +uint64_t scratch_cuda_unchecked_index_of_64( + CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t big_lwe_dimension, + uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_inputs, uint32_t num_blocks, uint32_t 
num_blocks_index, + uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, + bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) { + + int_radix_params params(pbs_type, glwe_dimension, polynomial_size, + big_lwe_dimension, small_lwe_dimension, ks_level, + ks_base_log, pbs_level, pbs_base_log, grouping_factor, + message_modulus, carry_modulus, noise_reduction_type); + + return scratch_cuda_unchecked_index_of( + CudaStreams(streams), (int_unchecked_index_of_buffer **)mem_ptr, + params, num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory); +} + +void cuda_unchecked_index_of_64(CudaStreamsFFI streams, + CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, int8_t *mem, + void *const *bsks, void *const *ksks) { + + host_unchecked_index_of( + CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs, + num_blocks, num_blocks_index, + (int_unchecked_index_of_buffer *)mem, bsks, + (uint64_t *const *)ksks); +} + +void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams, + int8_t **mem_ptr_void) { + int_unchecked_index_of_buffer *mem_ptr = + (int_unchecked_index_of_buffer *)(*mem_ptr_void); + + mem_ptr->release(CudaStreams(streams)); + + delete mem_ptr; + *mem_ptr_void = nullptr; +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh index ea52d1788..dbf00a593 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh @@ -28,22 +28,23 @@ __host__ void host_compute_equality_selectors( for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_stream_wait_event(mem_ptr->sub_streams_vec[j].stream(i), + 
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), mem_ptr->incoming_event, - mem_ptr->sub_streams_vec[j].gpu_index(i)); + mem_ptr->sub_streams[j].gpu_index(i)); } } uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); for (uint32_t i = 0; i < num_possible_values; i++) { uint32_t stream_idx = i % num_streams; - CudaStreams current_stream = mem_ptr->sub_streams_vec[stream_idx]; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; CudaRadixCiphertextFFI *current_tmp_block_comparisons = - mem_ptr->tmp_block_comparisons_vec[stream_idx]; + mem_ptr->tmp_block_comparisons[stream_idx]; int_comparison_buffer *current_reduction_buffer = mem_ptr->reduction_buffers[stream_idx]; @@ -75,10 +76,11 @@ __host__ void host_compute_equality_selectors( for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_event_record(mem_ptr->outgoing_events[j][i], - mem_ptr->sub_streams_vec[j].stream(i), - mem_ptr->sub_streams_vec[j].gpu_index(i)); - cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[j][i], + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], streams.gpu_index(0)); } } @@ -110,24 +112,25 @@ __host__ void host_create_possible_results( uint32_t max_luts_per_call = mem_ptr->max_luts_per_call; uint32_t num_lut_accumulators = mem_ptr->num_lut_accumulators; uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); cuda_event_record(mem_ptr->incoming_event, streams.stream(0), streams.gpu_index(0)); for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_stream_wait_event(mem_ptr->sub_streams_vec[j].stream(i), + 
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), mem_ptr->incoming_event, - mem_ptr->sub_streams_vec[j].gpu_index(i)); + mem_ptr->sub_streams[j].gpu_index(i)); } } for (uint32_t i = 0; i < num_possible_values; i++) { uint32_t stream_idx = i % num_streams; - CudaStreams current_stream = mem_ptr->sub_streams_vec[stream_idx]; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; CudaRadixCiphertextFFI *current_tmp_buffer = - mem_ptr->tmp_many_luts_output_vec[stream_idx]; + mem_ptr->tmp_many_luts_output[stream_idx]; CudaRadixCiphertextFFI const *current_selector = &lwe_array_in_list[i]; CudaRadixCiphertextFFI *current_output = &lwe_array_out_list[i]; @@ -138,7 +141,7 @@ __host__ void host_create_possible_results( uint32_t lut_index = stream_idx * num_lut_accumulators + k; - int_radix_lut *current_lut = mem_ptr->stream_luts_vec[lut_index]; + int_radix_lut *current_lut = mem_ptr->stream_luts[lut_index]; uint32_t luts_in_this_call = current_lut->num_many_lut; @@ -172,10 +175,11 @@ __host__ void host_create_possible_results( for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_event_record(mem_ptr->outgoing_events[j][i], - mem_ptr->sub_streams_vec[j].stream(i), - mem_ptr->sub_streams_vec[j].gpu_index(i)); - cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[j][i], + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], streams.gpu_index(0)); } } @@ -206,15 +210,16 @@ __host__ void host_aggregate_one_hot_vector( int_radix_params params = mem_ptr->params; uint32_t chunk_size = mem_ptr->chunk_size; uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); cuda_event_record(mem_ptr->incoming_event, streams.stream(0), streams.gpu_index(0)); for (uint32_t s 
= 0; s < num_streams; s++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_stream_wait_event(mem_ptr->sub_streams_vec[s].stream(i), + cuda_stream_wait_event(mem_ptr->sub_streams[s].stream(i), mem_ptr->incoming_event, - mem_ptr->sub_streams_vec[s].gpu_index(i)); + mem_ptr->sub_streams[s].gpu_index(i)); } } @@ -223,7 +228,7 @@ __host__ void host_aggregate_one_hot_vector( for (uint32_t s = 0; s < num_streams; s++) { - CudaStreams current_stream = mem_ptr->sub_streams_vec[s]; + CudaStreams current_stream = mem_ptr->sub_streams[s]; CudaRadixCiphertextFFI *current_agg = mem_ptr->partial_aggregated_vectors[s]; @@ -287,10 +292,11 @@ __host__ void host_aggregate_one_hot_vector( for (uint32_t s = 0; s < num_streams; s++) { for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { - cuda_event_record(mem_ptr->outgoing_events[s][i], - mem_ptr->sub_streams_vec[s].stream(i), - mem_ptr->sub_streams_vec[s].gpu_index(i)); - cuda_stream_wait_event(streams.stream(0), mem_ptr->outgoing_events[s][i], + cuda_event_record(mem_ptr->outgoing_events[s * num_gpus + i], + mem_ptr->sub_streams[s].stream(i), + mem_ptr->sub_streams[s].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[s * num_gpus + i], streams.gpu_index(0)); } } @@ -322,8 +328,8 @@ __host__ void host_aggregate_one_hot_vector( streams.stream(0), streams.gpu_index(0), temp_agg, 0, num_blocks, final_agg, 0, num_blocks); - CudaStreams message_stream = mem_ptr->sub_streams_vec[0]; - CudaStreams carry_stream = mem_ptr->sub_streams_vec[1]; + CudaStreams message_stream = mem_ptr->sub_streams[0]; + CudaStreams carry_stream = mem_ptr->sub_streams[1]; cuda_event_record(mem_ptr->reduction_done_event, streams.stream(0), streams.gpu_index(0)); @@ -498,3 +504,609 @@ __host__ void host_unchecked_match_value_or( mem_ptr->tmp_match_result, mem_ptr->tmp_or_value, mem_ptr->cmux_buffer, bsks, (Torus **)ksks); } + +template +uint64_t +scratch_cuda_unchecked_contains(CudaStreams 
streams, + int_unchecked_contains_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, + uint32_t num_blocks, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_contains_buffer( + streams, params, num_inputs, num_blocks, allocate_gpu_memory, + size_tracker); + + return size_tracker; +} + +template +__host__ void +host_unchecked_contains(CudaStreams streams, CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, + uint32_t num_inputs, uint32_t num_blocks, + int_unchecked_contains_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks) { + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + as_radix_ciphertext_slice(¤t_selector_block, + mem_ptr->packed_selectors, i, i + 1); + + host_equality_check(current_stream, ¤t_selector_block, + input_ct, value, mem_ptr->eq_buffers[stream_idx], + bsks, ksks, num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + 
host_integer_is_at_least_one_comparisons_block_true( + streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer, + bsks, (Torus **)ksks, num_inputs); +} + +template +uint64_t scratch_cuda_unchecked_contains_clear( + CudaStreams streams, int_unchecked_contains_clear_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks, + bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_contains_clear_buffer( + streams, params, num_inputs, num_blocks, allocate_gpu_memory, + size_tracker); + + return size_tracker; +} + +template +__host__ void host_unchecked_contains_clear( + CudaStreams streams, CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *inputs, const uint64_t *h_clear_val, + uint32_t num_inputs, uint32_t num_blocks, + int_unchecked_contains_clear_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val, + num_blocks * sizeof(Torus), streams.stream(0), + streams.gpu_index(0)); + + set_trivial_radix_ciphertext_async( + streams.stream(0), streams.gpu_index(0), mem_ptr->tmp_clear_val, + mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks, + mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus); + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + 
as_radix_ciphertext_slice(¤t_selector_block, + mem_ptr->packed_selectors, i, i + 1); + + host_equality_check(current_stream, ¤t_selector_block, + input_ct, mem_ptr->tmp_clear_val, + mem_ptr->eq_buffers[stream_idx], bsks, ksks, + num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + host_integer_is_at_least_one_comparisons_block_true( + streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer, + bsks, (Torus **)ksks, num_inputs); +} + +template +uint64_t scratch_cuda_unchecked_is_in_clears( + CudaStreams streams, int_unchecked_is_in_clears_buffer **mem_ptr, + int_radix_params params, uint32_t num_clears, uint32_t num_blocks, + bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_is_in_clears_buffer( + streams, params, num_clears, num_blocks, allocate_gpu_memory, + size_tracker); + + return size_tracker; +} + +template +__host__ void +host_unchecked_is_in_clears(CudaStreams streams, CudaRadixCiphertextFFI *output, + CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, uint32_t num_clears, + uint32_t num_blocks, + int_unchecked_is_in_clears_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks) { + + host_compute_equality_selectors(streams, mem_ptr->unpacked_selectors, + input, num_blocks, h_cleartexts, + mem_ptr->eq_buffer, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer, + bsks, (Torus **)ksks, num_clears); +} + +template +__host__ void host_compute_final_index_from_selectors( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + 
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors, + uint32_t num_inputs, uint32_t num_blocks_index, + int_final_index_from_selectors_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + for (uint32_t i = 0; i < num_inputs; i++) { + CudaRadixCiphertextFFI const *src_selector = &selectors[i]; + + copy_radix_ciphertext_slice_async( + streams.stream(0), streams.gpu_index(0), mem_ptr->packed_selectors, i, + i + 1, src_selector, 0, 1); + } + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results( + streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors, + num_inputs, mem_ptr->h_indices, packed_len, mem_ptr->possible_results_buf, + bsks, ksks); + + host_aggregate_one_hot_vector( + streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs, + packed_len, mem_ptr->aggregate_buf, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf, + bsks, (Torus **)ksks, num_inputs); +} + +template +uint64_t scratch_cuda_compute_final_index_from_selectors( + CudaStreams streams, int_final_index_from_selectors_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks_index, + bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_final_index_from_selectors_buffer( + streams, params, num_inputs, num_blocks_index, allocate_gpu_memory, + size_tracker); + + return size_tracker; +} + +template +uint64_t scratch_cuda_unchecked_index_in_clears( + CudaStreams streams, int_unchecked_index_in_clears_buffer **mem_ptr, + int_radix_params params, uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_index_in_clears_buffer( + streams, params, num_clears, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template +__host__ 
void host_unchecked_index_in_clears( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input, + const uint64_t *h_cleartexts, uint32_t num_clears, uint32_t num_blocks, + uint32_t num_blocks_index, + int_unchecked_index_in_clears_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + host_compute_equality_selectors( + streams, mem_ptr->final_index_buf->unpacked_selectors, input, num_blocks, + h_cleartexts, mem_ptr->eq_selectors_buf, bsks, ksks); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results( + streams, mem_ptr->final_index_buf->possible_results_ct_list, + mem_ptr->final_index_buf->unpacked_selectors, num_clears, + mem_ptr->final_index_buf->h_indices, packed_len, + mem_ptr->final_index_buf->possible_results_buf, bsks, ksks); + + host_aggregate_one_hot_vector( + streams, index_ct, mem_ptr->final_index_buf->possible_results_ct_list, + num_clears, packed_len, mem_ptr->final_index_buf->aggregate_buf, bsks, + ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, match_ct, mem_ptr->final_index_buf->packed_selectors, + mem_ptr->final_index_buf->reduction_buf, bsks, (Torus **)ksks, + num_clears); +} + +template +uint64_t scratch_cuda_unchecked_first_index_in_clears( + CudaStreams streams, + int_unchecked_first_index_in_clears_buffer **mem_ptr, + int_radix_params params, uint32_t num_unique, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_first_index_in_clears_buffer( + streams, params, num_unique, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template +__host__ void host_unchecked_first_index_in_clears( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input, + const uint64_t *h_unique_values, const uint64_t 
*h_unique_indices, + uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index, + int_unchecked_first_index_in_clears_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks) { + + host_compute_equality_selectors(streams, mem_ptr->unpacked_selectors, + input, num_blocks, h_unique_values, + mem_ptr->eq_selectors_buf, bsks, ksks); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results( + streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors, + num_unique, h_unique_indices, packed_len, mem_ptr->possible_results_buf, + bsks, ksks); + + host_aggregate_one_hot_vector( + streams, index_ct, mem_ptr->possible_results_ct_list, num_unique, + packed_len, mem_ptr->aggregate_buf, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf, + bsks, (Torus **)ksks, num_unique); +} + +template +uint64_t scratch_cuda_unchecked_first_index_of_clear( + CudaStreams streams, + int_unchecked_first_index_of_clear_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_first_index_of_clear_buffer( + streams, params, num_inputs, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template +__host__ void host_unchecked_first_index_of_clear( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, + int_unchecked_first_index_of_clear_buffer *mem_ptr, + void *const *bsks, Torus *const *ksks) { + + cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val, + num_blocks * sizeof(Torus), streams.stream(0), + streams.gpu_index(0)); + + set_trivial_radix_ciphertext_async( + streams.stream(0), 
streams.gpu_index(0), mem_ptr->tmp_clear_val, + mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks, + mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus); + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + as_radix_ciphertext_slice<Torus>(&current_selector_block, + mem_ptr->packed_selectors, i, i + 1); + + host_equality_check<Torus>(current_stream, &current_selector_block, + input_ct, mem_ptr->tmp_clear_val, + mem_ptr->eq_buffers[stream_idx], bsks, ksks, + num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) { + uint32_t count = num_inputs - offset; + + CudaRadixCiphertextFFI current_slice; + as_radix_ciphertext_slice<Torus>(&current_slice, mem_ptr->packed_selectors, + offset, num_inputs); + + CudaRadixCiphertextFFI prev_slice; + as_radix_ciphertext_slice<Torus>(&prev_slice, mem_ptr->packed_selectors, 0, + count); + + integer_radix_apply_bivariate_lookup_table<Torus>( + streams, &current_slice, &current_slice, &prev_slice, bsks, ksks, + 
mem_ptr->prefix_sum_lut, count, mem_ptr->params.message_modulus); + } + + integer_radix_apply_univariate_lookup_table( + streams, mem_ptr->packed_selectors, mem_ptr->packed_selectors, bsks, ksks, + mem_ptr->cleanup_lut, num_inputs); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results( + streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors, + num_inputs, (const uint64_t *)mem_ptr->h_indices, packed_len, + mem_ptr->possible_results_buf, bsks, ksks); + + host_aggregate_one_hot_vector( + streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs, + packed_len, mem_ptr->aggregate_buf, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true( + streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf, + bsks, (Torus **)ksks, num_inputs); +} + +template +uint64_t scratch_cuda_unchecked_first_index_of( + CudaStreams streams, int_unchecked_first_index_of_buffer **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_first_index_of_buffer( + streams, params, num_inputs, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template +__host__ void host_unchecked_first_index_of( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, uint32_t num_inputs, + uint32_t num_blocks, uint32_t num_blocks_index, + int_unchecked_first_index_of_buffer *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + 
mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + as_radix_ciphertext_slice<Torus>(&current_selector_block, + mem_ptr->packed_selectors, i, i + 1); + + host_equality_check<Torus>(current_stream, &current_selector_block, + input_ct, value, mem_ptr->eq_buffers[stream_idx], + bsks, ksks, num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) { + uint32_t count = num_inputs - offset; + + CudaRadixCiphertextFFI current_slice; + as_radix_ciphertext_slice<Torus>(&current_slice, mem_ptr->packed_selectors, + offset, num_inputs); + + CudaRadixCiphertextFFI prev_slice; + as_radix_ciphertext_slice<Torus>(&prev_slice, mem_ptr->packed_selectors, 0, + count); + + integer_radix_apply_bivariate_lookup_table<Torus>( + streams, &current_slice, &current_slice, &prev_slice, bsks, ksks, + mem_ptr->prefix_sum_lut, count, mem_ptr->params.message_modulus); + } + + integer_radix_apply_univariate_lookup_table<Torus>( + streams, mem_ptr->packed_selectors, mem_ptr->packed_selectors, bsks, ksks, + mem_ptr->cleanup_lut, num_inputs); + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results<Torus>( + streams, mem_ptr->possible_results_ct_list, mem_ptr->unpacked_selectors, + num_inputs, (const uint64_t *)mem_ptr->h_indices, packed_len, + mem_ptr->possible_results_buf, bsks, 
ksks); + + host_aggregate_one_hot_vector<Torus>( + streams, index_ct, mem_ptr->possible_results_ct_list, num_inputs, + packed_len, mem_ptr->aggregate_buf, bsks, ksks); + + host_integer_is_at_least_one_comparisons_block_true<Torus>( + streams, match_ct, mem_ptr->packed_selectors, mem_ptr->reduction_buf, + bsks, (Torus **)ksks, num_inputs); +} + +template <typename Torus> +uint64_t scratch_cuda_unchecked_index_of( + CudaStreams streams, int_unchecked_index_of_buffer<Torus> **mem_ptr, + int_radix_params params, uint32_t num_inputs, uint32_t num_blocks, + uint32_t num_blocks_index, bool allocate_gpu_memory) { + + uint64_t size_tracker = 0; + *mem_ptr = new int_unchecked_index_of_buffer<Torus>( + streams, params, num_inputs, num_blocks, num_blocks_index, + allocate_gpu_memory, size_tracker); + + return size_tracker; +} + +template <typename Torus> +__host__ void host_unchecked_index_of( + CudaStreams streams, CudaRadixCiphertextFFI *index_ct, + CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs, + CudaRadixCiphertextFFI const *value, uint32_t num_inputs, + uint32_t num_blocks, uint32_t num_blocks_index, + int_unchecked_index_of_buffer<Torus> *mem_ptr, void *const *bsks, + Torus *const *ksks) { + + cuda_event_record(mem_ptr->incoming_event, streams.stream(0), + streams.gpu_index(0)); + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i), + mem_ptr->incoming_event, + mem_ptr->sub_streams[j].gpu_index(i)); + } + } + + uint32_t num_streams = mem_ptr->num_streams; + uint32_t num_gpus = mem_ptr->active_streams.count(); + + for (uint32_t i = 0; i < num_inputs; i++) { + uint32_t stream_idx = i % num_streams; + CudaStreams current_stream = mem_ptr->sub_streams[stream_idx]; + + CudaRadixCiphertextFFI const *input_ct = &inputs[i]; + + CudaRadixCiphertextFFI current_selector_block; + as_radix_ciphertext_slice<Torus>(&current_selector_block, + mem_ptr->final_index_buf->packed_selectors, + i, i + 1); + + 
host_equality_check<Torus>(current_stream, &current_selector_block, + input_ct, value, mem_ptr->eq_buffers[stream_idx], + bsks, ksks, num_blocks); + } + + for (uint32_t j = 0; j < mem_ptr->num_streams; j++) { + for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) { + cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i], + mem_ptr->sub_streams[j].stream(i), + mem_ptr->sub_streams[j].gpu_index(i)); + cuda_stream_wait_event(streams.stream(0), + mem_ptr->outgoing_events[j * num_gpus + i], + streams.gpu_index(0)); + } + } + + uint32_t packed_len = (num_blocks_index + 1) / 2; + + host_create_possible_results<Torus>( + streams, mem_ptr->final_index_buf->possible_results_ct_list, + mem_ptr->final_index_buf->unpacked_selectors, num_inputs, + (const uint64_t *)mem_ptr->final_index_buf->h_indices, packed_len, + mem_ptr->final_index_buf->possible_results_buf, bsks, ksks); + + host_aggregate_one_hot_vector<Torus>( + streams, index_ct, mem_ptr->final_index_buf->possible_results_ct_list, + num_inputs, packed_len, mem_ptr->final_index_buf->aggregate_buf, bsks, + ksks); + + host_integer_is_at_least_one_comparisons_block_true<Torus>( + streams, match_ct, mem_ptr->final_index_buf->packed_selectors, + mem_ptr->final_index_buf->reduction_buf, bsks, (Torus **)ksks, + num_inputs); +} diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index de14e045e..875a520c4 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -1100,45 +1100,6 @@ unsafe extern "C" { unsafe extern "C" { pub fn cleanup_cuda_integer_div_rem(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8); } -unsafe extern "C" { - pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - streams: CudaStreamsFFI, - mem_ptr: *mut *mut i8, - input_lut: *const ffi::c_void, - lwe_dimension: u32, - glwe_dimension: u32, - polynomial_size: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - 
num_radix_blocks: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - lut_degree: u64, - allocate_gpu_memory: bool, - noise_reduction_type: PBS_MS_REDUCTION_T, - ) -> u64; -} -unsafe extern "C" { - pub fn cuda_integer_compute_prefix_sum_hillis_steele_64( - streams: CudaStreamsFFI, - output_radix_lwe: *mut CudaRadixCiphertextFFI, - generates_or_propagates: *mut CudaRadixCiphertextFFI, - mem_ptr: *mut i8, - ksks: *const *mut ffi::c_void, - bsks: *const *mut ffi::c_void, - num_blocks: u32, - ); -} -unsafe extern "C" { - pub fn cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - streams: CudaStreamsFFI, - mem_ptr_void: *mut *mut i8, - ); -} unsafe extern "C" { pub fn cuda_integer_reverse_blocks_64_inplace( streams: CudaStreamsFFI, @@ -1715,127 +1676,6 @@ unsafe extern "C" { unsafe extern "C" { pub fn cleanup_cuda_integer_ilog2_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8); } -unsafe extern "C" { - pub fn scratch_cuda_compute_equality_selectors_64( - streams: CudaStreamsFFI, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - num_possible_values: u32, - num_blocks: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - allocate_gpu_memory: bool, - noise_reduction_type: PBS_MS_REDUCTION_T, - ) -> u64; -} -unsafe extern "C" { - pub fn cuda_compute_equality_selectors_64( - streams: CudaStreamsFFI, - lwe_array_out_list: *mut CudaRadixCiphertextFFI, - lwe_array_in: *const CudaRadixCiphertextFFI, - num_blocks: u32, - h_decomposed_cleartexts: *const u64, - mem: *mut i8, - bsks: *const *mut ffi::c_void, - ksks: *const *mut ffi::c_void, - ); -} -unsafe extern "C" { - pub fn cleanup_cuda_compute_equality_selectors_64( - streams: CudaStreamsFFI, - mem_ptr_void: *mut *mut i8, - ); -} -unsafe extern "C" { - pub fn 
scratch_cuda_create_possible_results_64( - streams: CudaStreamsFFI, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - num_possible_values: u32, - num_blocks: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - allocate_gpu_memory: bool, - noise_reduction_type: PBS_MS_REDUCTION_T, - ) -> u64; -} -unsafe extern "C" { - pub fn cuda_create_possible_results_64( - streams: CudaStreamsFFI, - lwe_array_out_list: *mut CudaRadixCiphertextFFI, - lwe_array_in_list: *const CudaRadixCiphertextFFI, - num_possible_values: u32, - h_decomposed_cleartexts: *const u64, - num_blocks: u32, - mem: *mut i8, - bsks: *const *mut ffi::c_void, - ksks: *const *mut ffi::c_void, - ); -} -unsafe extern "C" { - pub fn cleanup_cuda_create_possible_results_64( - streams: CudaStreamsFFI, - mem_ptr_void: *mut *mut i8, - ); -} -unsafe extern "C" { - pub fn scratch_cuda_aggregate_one_hot_vector_64( - streams: CudaStreamsFFI, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - num_blocks: u32, - num_matches: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: PBS_TYPE, - allocate_gpu_memory: bool, - noise_reduction_type: PBS_MS_REDUCTION_T, - ) -> u64; -} -unsafe extern "C" { - pub fn cuda_aggregate_one_hot_vector_64( - streams: CudaStreamsFFI, - lwe_array_out: *mut CudaRadixCiphertextFFI, - lwe_array_in_list: *const CudaRadixCiphertextFFI, - num_input_ciphertexts: u32, - num_blocks: u32, - mem: *mut i8, - bsks: *const *mut ffi::c_void, - ksks: *const *mut ffi::c_void, - ); -} -unsafe extern "C" { - pub fn cleanup_cuda_aggregate_one_hot_vector_64( - streams: CudaStreamsFFI, - mem_ptr_void: *mut *mut i8, - ); -} unsafe 
extern "C" { pub fn scratch_cuda_unchecked_match_value_64( streams: CudaStreamsFFI, @@ -1962,6 +1802,385 @@ unsafe extern "C" { mem_ptr_void: *mut *mut i8, ); } +unsafe extern "C" { + pub fn scratch_cuda_unchecked_contains_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_contains_64( + streams: CudaStreamsFFI, + output: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + value: *const CudaRadixCiphertextFFI, + num_inputs: u32, + num_blocks: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_contains_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_contains_clear_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_contains_clear_64( + streams: CudaStreamsFFI, + output: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + h_clear_val: *const u64, + num_inputs: u32, + num_blocks: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + 
pub fn cleanup_cuda_unchecked_contains_clear_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_is_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_clears: u32, + num_blocks: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_is_in_clears_64( + streams: CudaStreamsFFI, + output: *mut CudaRadixCiphertextFFI, + input: *const CudaRadixCiphertextFFI, + h_cleartexts: *const u64, + num_clears: u32, + num_blocks: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_is_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_compute_final_index_from_selectors_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_compute_final_index_from_selectors_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + selectors: *const CudaRadixCiphertextFFI, + num_inputs: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe 
extern "C" { + pub fn cleanup_cuda_compute_final_index_from_selectors_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_index_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_clears: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_index_in_clears_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + input: *const CudaRadixCiphertextFFI, + h_cleartexts: *const u64, + num_clears: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_index_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_first_index_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_unique: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_first_index_in_clears_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + input: *const CudaRadixCiphertextFFI, + 
h_unique_values: *const u64, + h_unique_indices: *const u64, + num_unique: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_first_index_in_clears_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_first_index_of_clear_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_first_index_of_clear_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + h_clear_val: *const u64, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_first_index_of_clear_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_first_index_of_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} 
+unsafe extern "C" { + pub fn cuda_unchecked_first_index_of_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + value: *const CudaRadixCiphertextFFI, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_first_index_of_64( + streams: CudaStreamsFFI, + mem_ptr_void: *mut *mut i8, + ); +} +unsafe extern "C" { + pub fn scratch_cuda_unchecked_index_of_64( + streams: CudaStreamsFFI, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: PBS_TYPE, + allocate_gpu_memory: bool, + noise_reduction_type: PBS_MS_REDUCTION_T, + ) -> u64; +} +unsafe extern "C" { + pub fn cuda_unchecked_index_of_64( + streams: CudaStreamsFFI, + index_ct: *mut CudaRadixCiphertextFFI, + match_ct: *mut CudaRadixCiphertextFFI, + inputs: *const CudaRadixCiphertextFFI, + value: *const CudaRadixCiphertextFFI, + num_inputs: u32, + num_blocks: u32, + num_blocks_index: u32, + mem: *mut i8, + bsks: *const *mut ffi::c_void, + ksks: *const *mut ffi::c_void, + ); +} +unsafe extern "C" { + pub fn cleanup_cuda_unchecked_index_of_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8); +} unsafe extern "C" { pub fn scratch_cuda_integer_compress_radix_ciphertext_64( streams: CudaStreamsFFI, diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index a6e65b5ee..98b47a6ca 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -32,10 +32,13 @@ use crate::prelude::{CastFrom, CastInto}; use crate::shortint::ciphertext::{Degree, NoiseLevel}; 
use crate::shortint::parameters::ModulusSwitchType; use crate::shortint::{CarryModulus, MessageModulus}; +use crate::MatchValues; use itertools::Itertools; +use rayon::prelude::*; pub use server_key::CudaServerKey; use std::any::TypeId; use std::cmp::min; +use std::hash::Hash; use tfhe_cuda_backend::bindings::*; use tfhe_cuda_backend::cuda_bind::*; @@ -6795,124 +6798,6 @@ pub(crate) unsafe fn cuda_backend_ilog2( update_noise_degree(output, &cuda_ffi_output); } -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - The data must not be moved or dropped while being used by the CUDA kernel. -/// - This function assumes exclusive access to the passed data; violating this may lead to -/// undefined behavior. -pub(crate) unsafe fn cuda_backend_compute_prefix_sum_hillis_steele< - T: UnsignedInteger, - B: Numeric, ->( - streams: &CudaStreams, - output: &mut CudaSliceMut, - output_degrees: &mut Vec, - output_noise_levels: &mut Vec, - generates_or_propagates: &mut CudaSliceMut, - generates_or_propagates_degrees: &mut Vec, - generates_or_propagates_noise_levels: &mut Vec, - input_lut: &[T], - lut_degree: u64, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - lwe_dimension: LweDimension, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - num_blocks: u32, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, - ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) { - assert_eq!( - streams.gpu_indexes[0], - generates_or_propagates.gpu_index(0), - "GPU error: first stream is on GPU {}, first generates_or_propagates pointer is on GPU {}", - streams.gpu_indexes[0].get(), - generates_or_propagates.gpu_index(0).get(), - ); - assert_eq!( - streams.gpu_indexes[0], - output.gpu_index(0), - 
"GPU error: first stream is on GPU {}, first output pointer is on GPU {}", - streams.gpu_indexes[0].get(), - output.gpu_index(0).get(), - ); - assert_eq!( - streams.gpu_indexes[0], - bootstrapping_key.gpu_index(0), - "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}", - streams.gpu_indexes[0].get(), - bootstrapping_key.gpu_index(0).get(), - ); - assert_eq!( - streams.gpu_indexes[0], - keyswitch_key.gpu_index(0), - "GPU error: first stream is on GPU {}, first ksk pointer is on GPU {}", - streams.gpu_indexes[0].get(), - keyswitch_key.gpu_index(0).get(), - ); - - let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); - - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - let mut cuda_ffi_output = prepare_cuda_radix_ffi_from_slice_mut( - output, - output_degrees, - output_noise_levels, - num_blocks, - (glwe_dimension.0 * polynomial_size.0) as u32, - ); - let mut cuda_ffi_generates_or_propagates = prepare_cuda_radix_ffi_from_slice_mut( - generates_or_propagates, - generates_or_propagates_degrees, - generates_or_propagates_noise_levels, - num_blocks, - (glwe_dimension.0 * polynomial_size.0) as u32, - ); - scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( - streams.ffi(), - std::ptr::addr_of_mut!(mem_ptr), - input_lut.as_ptr().cast(), - lwe_dimension.0 as u32, - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - lut_degree, - true, - noise_reduction_type as u32, - ); - - cuda_integer_compute_prefix_sum_hillis_steele_64( - streams.ffi(), - &raw mut cuda_ffi_output, - &raw mut cuda_ffi_generates_or_propagates, - mem_ptr, - keyswitch_key.ptr.as_ptr(), - bootstrapping_key.ptr.as_ptr(), - num_blocks, - ); - - cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64( - streams.ffi(), - 
std::ptr::addr_of_mut!(mem_ptr), - ); -} - #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -8431,402 +8316,6 @@ pub(crate) unsafe fn cuda_backend_unchecked_bitnot_assign( update_noise_degree(ciphertext, &cuda_ffi_ciphertext); } -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - The data must not be moved or dropped while being used by the CUDA kernel. -/// - This function assumes exclusive access to the passed data; violating this may lead to -/// undefined behavior. -pub(crate) unsafe fn cuda_backend_compute_equality_selectors( - streams: &CudaStreams, - lwe_array_out_list: &mut [CudaBooleanBlock], - lwe_array_in: &CudaRadixCiphertext, - h_decomposed_cleartexts: &[u64], - num_possible_values: u32, - num_blocks: u32, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, - ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) { - assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); - assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); - assert_eq!( - streams.gpu_indexes[0], - lwe_array_in.d_blocks.0.d_vec.gpu_index(0) - ); - - let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); - - let mut ffi_out_degrees: Vec> = Vec::with_capacity(lwe_array_out_list.len()); - let mut ffi_out_noise_levels: Vec> = Vec::with_capacity(lwe_array_out_list.len()); - - let mut ffi_out_structs: Vec = lwe_array_out_list - .iter_mut() - .map(|ct| { - assert_eq!( - streams.gpu_indexes[0], - ct.0.ciphertext.d_blocks.0.d_vec.gpu_index(0) - ); - 
ffi_out_degrees.push(vec![ct.0.ciphertext.info.blocks[0].degree.get()]); - ffi_out_noise_levels.push(vec![ct.0.ciphertext.info.blocks[0].noise_level.0]); - prepare_cuda_radix_ffi( - &ct.0.ciphertext, - ffi_out_degrees.last_mut().unwrap(), - ffi_out_noise_levels.last_mut().unwrap(), - ) - }) - .collect(); - - let mut ffi_in_degrees: Vec = lwe_array_in - .info - .blocks - .iter() - .map(|b| b.degree.get()) - .collect(); - let mut ffi_in_noise_levels: Vec = lwe_array_in - .info - .blocks - .iter() - .map(|b| b.noise_level.0) - .collect(); - - let ffi_in_struct: CudaRadixCiphertextFFI = - prepare_cuda_radix_ffi(lwe_array_in, &mut ffi_in_degrees, &mut ffi_in_noise_levels); - - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_compute_equality_selectors_64( - streams.ffi(), - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_possible_values, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - noise_reduction_type as u32, - ); - - cuda_compute_equality_selectors_64( - streams.ffi(), - ffi_out_structs.as_mut_ptr(), - &raw const ffi_in_struct, - num_blocks, - h_decomposed_cleartexts.as_ptr(), - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - ); - - cleanup_cuda_compute_equality_selectors_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); - - for (ct, ffi_struct) in lwe_array_out_list.iter_mut().zip(ffi_out_structs.iter()) { - update_noise_degree(&mut ct.0.ciphertext, ffi_struct); - } -} - -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - The data must not be moved or dropped while being used by the CUDA kernel. -/// - This function assumes exclusive access to the passed data; violating this may lead to -/// undefined behavior. 
-pub(crate) unsafe fn cuda_backend_create_possible_results< - T: UnsignedInteger, - B: Numeric, - R: CudaIntegerRadixCiphertext, ->( - streams: &CudaStreams, - lwe_array_out_list: &mut [R], - lwe_array_in_list: &[CudaBooleanBlock], - h_decomposed_cleartexts: &[u64], - num_blocks: u32, - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, - ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) { - assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); - assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); - - let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); - - let mut ffi_out_degrees: Vec> = Vec::with_capacity(lwe_array_out_list.len()); - let mut ffi_out_noise_levels: Vec> = Vec::with_capacity(lwe_array_out_list.len()); - - let mut ffi_out_structs: Vec = lwe_array_out_list - .iter_mut() - .map(|ct| { - assert_eq!( - streams.gpu_indexes[0], - ct.as_ref().d_blocks.0.d_vec.gpu_index(0) - ); - let degrees: Vec = ct - .as_ref() - .info - .blocks - .iter() - .map(|b| b.degree.get()) - .collect(); - let noise_levels: Vec = ct - .as_ref() - .info - .blocks - .iter() - .map(|b| b.noise_level.0) - .collect(); - - ffi_out_degrees.push(degrees); - ffi_out_noise_levels.push(noise_levels); - - prepare_cuda_radix_ffi( - ct.as_ref(), - ffi_out_degrees.last_mut().unwrap(), - ffi_out_noise_levels.last_mut().unwrap(), - ) - }) - .collect(); - - let mut ffi_in_degrees: Vec> = Vec::with_capacity(lwe_array_in_list.len()); - let mut ffi_in_noise_levels: Vec> = 
Vec::with_capacity(lwe_array_in_list.len()); - - let ffi_in_structs: Vec = lwe_array_in_list - .iter() - .map(|boolean_block| { - assert_eq!( - streams.gpu_indexes[0], - boolean_block.0.ciphertext.d_blocks.0.d_vec.gpu_index(0) - ); - - let degrees = vec![boolean_block.0.ciphertext.info.blocks[0].degree.get()]; - let noise_levels = vec![boolean_block.0.ciphertext.info.blocks[0].noise_level.0]; - - ffi_in_degrees.push(degrees); - ffi_in_noise_levels.push(noise_levels); - - prepare_cuda_radix_ffi( - &boolean_block.0.ciphertext, - ffi_in_degrees.last_mut().unwrap(), - ffi_in_noise_levels.last_mut().unwrap(), - ) - }) - .collect(); - - let num_possible_values = lwe_array_in_list.len() as u32; - - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_create_possible_results_64( - streams.ffi(), - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_possible_values, - num_blocks, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - noise_reduction_type as u32, - ); - - cuda_create_possible_results_64( - streams.ffi(), - ffi_out_structs.as_mut_ptr(), - ffi_in_structs.as_ptr(), - num_possible_values, - h_decomposed_cleartexts.as_ptr(), - num_blocks, - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - ); - - cleanup_cuda_create_possible_results_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); - - for (i, ct) in lwe_array_out_list.iter_mut().enumerate() { - update_noise_degree(ct.as_mut(), &ffi_out_structs[i]); - } -} - -#[allow(clippy::too_many_arguments)] -/// # Safety -/// -/// - The data must not be moved or dropped while being used by the CUDA kernel. -/// - This function assumes exclusive access to the passed data; violating this may lead to -/// undefined behavior. 
-pub(crate) unsafe fn cuda_backend_aggregate_one_hot_vector< - T: UnsignedInteger, - B: Numeric, - R: CudaIntegerRadixCiphertext, ->( - streams: &CudaStreams, - lwe_array_out: &mut R, - lwe_array_in_list: &[R], - message_modulus: MessageModulus, - carry_modulus: CarryModulus, - bootstrapping_key: &CudaVec, - keyswitch_key: &CudaVec, - glwe_dimension: GlweDimension, - polynomial_size: PolynomialSize, - big_lwe_dimension: LweDimension, - small_lwe_dimension: LweDimension, - ks_level: DecompositionLevelCount, - ks_base_log: DecompositionBaseLog, - pbs_level: DecompositionLevelCount, - pbs_base_log: DecompositionBaseLog, - pbs_type: PBSType, - grouping_factor: LweBskGroupingFactor, - ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) { - assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); - assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); - assert_eq!( - streams.gpu_indexes[0], - lwe_array_out.as_ref().d_blocks.0.d_vec.gpu_index(0) - ); - - let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); - - let mut ffi_out_degrees: Vec = lwe_array_out - .as_ref() - .info - .blocks - .iter() - .map(|b| b.degree.get()) - .collect(); - let mut ffi_out_noise_levels: Vec = lwe_array_out - .as_ref() - .info - .blocks - .iter() - .map(|b| b.noise_level.0) - .collect(); - - let mut ffi_out_struct = prepare_cuda_radix_ffi( - lwe_array_out.as_ref(), - &mut ffi_out_degrees, - &mut ffi_out_noise_levels, - ); - - let mut ffi_in_degrees: Vec> = Vec::with_capacity(lwe_array_in_list.len()); - let mut ffi_in_noise_levels: Vec> = Vec::with_capacity(lwe_array_in_list.len()); - - let ffi_in_structs: Vec = lwe_array_in_list - .iter() - .map(|ct| { - assert_eq!( - streams.gpu_indexes[0], - ct.as_ref().d_blocks.0.d_vec.gpu_index(0) - ); - let degrees: Vec = ct - .as_ref() - .info - .blocks - .iter() - .map(|b| b.degree.get()) - .collect(); - let noise_levels: Vec = ct - .as_ref() - .info - 
.blocks - .iter() - .map(|b| b.noise_level.0) - .collect(); - - ffi_in_degrees.push(degrees); - ffi_in_noise_levels.push(noise_levels); - - prepare_cuda_radix_ffi( - ct.as_ref(), - ffi_in_degrees.last_mut().unwrap(), - ffi_in_noise_levels.last_mut().unwrap(), - ) - }) - .collect(); - - let num_input_ciphertexts = lwe_array_in_list.len() as u32; - let num_blocks = lwe_array_in_list[0] - .as_ref() - .d_blocks - .lwe_ciphertext_count() - .0 as u32; - - let mut mem_ptr: *mut i8 = std::ptr::null_mut(); - scratch_cuda_aggregate_one_hot_vector_64( - streams.ffi(), - std::ptr::addr_of_mut!(mem_ptr), - glwe_dimension.0 as u32, - polynomial_size.0 as u32, - big_lwe_dimension.0 as u32, - small_lwe_dimension.0 as u32, - ks_level.0 as u32, - ks_base_log.0 as u32, - pbs_level.0 as u32, - pbs_base_log.0 as u32, - grouping_factor.0 as u32, - num_blocks, - num_input_ciphertexts, - message_modulus.0 as u32, - carry_modulus.0 as u32, - pbs_type as u32, - true, - noise_reduction_type as u32, - ); - - cuda_aggregate_one_hot_vector_64( - streams.ffi(), - &raw mut ffi_out_struct, - ffi_in_structs.as_ptr(), - num_input_ciphertexts, - num_blocks, - mem_ptr, - bootstrapping_key.ptr.as_ptr(), - keyswitch_key.ptr.as_ptr(), - ); - - cleanup_cuda_aggregate_one_hot_vector_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); - - update_noise_degree(lwe_array_out.as_mut(), &ffi_out_struct); -} - #[allow(clippy::too_many_arguments)] /// # Safety /// @@ -8837,17 +8326,13 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value< T: UnsignedInteger, B: Numeric, R: CudaIntegerRadixCiphertext, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, >( streams: &CudaStreams, lwe_array_out_result: &mut R, lwe_array_out_boolean: &mut CudaBooleanBlock, lwe_array_in_ct: &CudaRadixCiphertext, - h_match_inputs: &[u64], - h_match_outputs: &[u64], - num_matches: u32, - num_input_blocks: u32, - num_output_packed_blocks: u32, - max_output_is_zero: bool, + matches: &MatchValues, message_modulus: 
MessageModulus, carry_modulus: CarryModulus, bootstrapping_key: &CudaVec, @@ -8885,6 +8370,50 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value< .gpu_index(0) ); + let num_input_blocks = lwe_array_in_ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + let h_match_inputs: Vec = matches + .get_values() + .par_iter() + .map(|(input, _output)| *input) + .flat_map(|input_value: Clear| { + BlockDecomposer::new(input_value, num_bits_in_message) + .take(num_input_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let max_output_value = matches + .get_values() + .iter() + .copied() + .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) + .expect("luts is not empty at this point") + .1; + + let num_output_unpacked_blocks = lwe_array_out_result + .as_ref() + .d_blocks + .lwe_ciphertext_count() + .0 as u32; + let num_output_packed_blocks = num_output_unpacked_blocks.div_ceil(2); + + let h_match_outputs: Vec = matches + .get_values() + .par_iter() + .map(|(_input, output)| *output) + .flat_map(|output_value: Clear| { + BlockDecomposer::new(output_value, 2 * num_bits_in_message) + .take(num_output_packed_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let max_output_is_zero = max_output_value == Clear::ZERO; + let num_matches = matches.get_values().len() as u32; + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); let mut ffi_out_result_degrees: Vec = lwe_array_out_result @@ -8993,8 +8522,10 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value< /// - The data must not be moved or dropped while being used by the CUDA kernel. /// - This function assumes exclusive access to the passed data; violating this may lead to /// undefined behavior. 
-pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu( +pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu( streams: &CudaStreams, + ct: &CudaRadixCiphertext, + matches: &MatchValues, glwe_dimension: GlweDimension, polynomial_size: PolynomialSize, big_lwe_dimension: LweDimension, @@ -9007,12 +8538,34 @@ pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu( message_modulus: MessageModulus, carry_modulus: CarryModulus, pbs_type: PBSType, - num_matches: u32, - num_input_blocks: u32, - num_output_packed_blocks: u32, - max_output_is_zero: bool, ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) -> u64 { +) -> u64 +where + Clear: UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, +{ + let num_input_blocks = ct.d_blocks.lwe_ciphertext_count().0 as u32; + + let max_output_value = matches + .get_values() + .iter() + .copied() + .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) + .expect("luts is not empty at this point") + .1; + + let num_bits_in_message = message_modulus.0.ilog2(); + let max_val_u64: u64 = max_output_value.cast_into(); + + let num_output_unpacked_blocks = if max_val_u64 == 0 { + 1 + } else { + (max_val_u64.ilog2() + 1).div_ceil(num_bits_in_message) + }; + + let num_output_packed_blocks = num_output_unpacked_blocks.div_ceil(2); + let max_output_is_zero = max_output_value == Clear::ZERO; + let num_matches = matches.get_values().len() as u32; + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); @@ -9054,8 +8607,11 @@ pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu( /// - The data must not be moved or dropped while being used by the CUDA kernel. /// - This function assumes exclusive access to the passed data; violating this may lead to /// undefined behavior. 
-pub(crate) fn cuda_backend_get_unchecked_match_value_or_size_on_gpu( +pub(crate) fn cuda_backend_get_unchecked_match_value_or_size_on_gpu( streams: &CudaStreams, + ct: &CudaRadixCiphertext, + matches: &MatchValues, + or_value: Clear, glwe_dimension: GlweDimension, polynomial_size: PolynomialSize, big_lwe_dimension: LweDimension, @@ -9068,13 +8624,41 @@ pub(crate) fn cuda_backend_get_unchecked_match_value_or_size_on_gpu( message_modulus: MessageModulus, carry_modulus: CarryModulus, pbs_type: PBSType, - num_matches: u32, - num_input_blocks: u32, - num_match_packed_blocks: u32, - num_output_blocks: u32, - max_output_is_zero: bool, ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, -) -> u64 { +) -> u64 +where + Clear: UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, +{ + let num_input_blocks = ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let max_output_value = matches + .get_values() + .iter() + .copied() + .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) + .expect("luts is not empty at this point") + .1; + + let max_val_u64: u64 = max_output_value.cast_into(); + let or_val_u64: u64 = or_value.cast_into(); + + let calc_blocks = |val: u64| -> u32 { + if val == 0 { + 1 + } else { + (val.ilog2() + 1).div_ceil(num_bits_in_message) + } + }; + + let num_blocks_match = calc_blocks(max_val_u64); + let num_blocks_or = calc_blocks(or_val_u64); + + let num_output_blocks = num_blocks_match.max(num_blocks_or); + let num_match_packed_blocks = num_blocks_match.div_ceil(2); + let max_output_is_zero = max_output_value == Clear::ZERO; + let num_matches = matches.get_values().len() as u32; + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); let mut mem_ptr: *mut i8 = std::ptr::null_mut(); @@ -9210,18 +8794,13 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or< T: UnsignedInteger, B: Numeric, R: 
CudaIntegerRadixCiphertext, + Clear: UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, >( streams: &CudaStreams, lwe_array_out: &mut R, lwe_array_in_ct: &CudaRadixCiphertext, - h_match_inputs: &[u64], - h_match_outputs: &[u64], - h_or_value: &[u64], - num_matches: u32, - num_input_blocks: u32, - num_match_packed_blocks: u32, - num_final_blocks: u32, - max_output_is_zero: bool, + matches: &MatchValues, + or_value: Clear, message_modulus: MessageModulus, carry_modulus: CarryModulus, bootstrapping_key: &CudaVec, @@ -9238,8 +8817,72 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or< grouping_factor: LweBskGroupingFactor, ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, ) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!( + streams.gpu_indexes[0], + lwe_array_in_ct.d_blocks.0.d_vec.gpu_index(0) + ); + assert_eq!( + streams.gpu_indexes[0], + lwe_array_out.as_ref().d_blocks.0.d_vec.gpu_index(0) + ); + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + let num_input_blocks = lwe_array_in_ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_match_inputs: Vec = matches + .get_values() + .par_iter() + .map(|(input, _output)| *input) + .flat_map(|input_value: Clear| { + BlockDecomposer::new(input_value, num_bits_in_message) + .take(num_input_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let max_output_value = matches + .get_values() + .iter() + .copied() + .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) + .expect("luts is not empty at this point") + .1; + + let max_val_u64: u64 = max_output_value.cast_into(); + let num_blocks_match = if max_val_u64 == 0 { + 1 + } else { + (max_val_u64.ilog2() + 1).div_ceil(num_bits_in_message) + 
}; + let num_match_packed_blocks = num_blocks_match.div_ceil(2); + + let h_match_outputs: Vec = matches + .get_values() + .par_iter() + .map(|(_input, output)| *output) + .flat_map(|output_value: Clear| { + BlockDecomposer::new(output_value, 2 * num_bits_in_message) + .take(num_match_packed_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let num_final_blocks = lwe_array_out.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; + + let h_or_value: Vec = BlockDecomposer::new(or_value, num_bits_in_message) + .take(num_final_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect(); + + let max_output_is_zero = max_output_value == Clear::ZERO; + let num_matches = matches.get_values().len() as u32; + let mut ffi_out_degrees: Vec = lwe_array_out .as_ref() .info @@ -9320,3 +8963,1192 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or< update_noise_degree(lwe_array_out.as_mut(), &ffi_out_struct); } + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_contains< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, +>( + streams: &CudaStreams, + output: &mut CudaBooleanBlock, + inputs: &[C], + value: &CudaRadixCiphertext, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], value.d_blocks.0.d_vec.gpu_index(0)); + + let num_inputs = inputs.len() as u32; + let num_blocks = value.d_blocks.lwe_ciphertext_count().0 as u32; + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut output_degrees = vec![output.0.ciphertext.info.blocks[0].degree.get()]; + let mut output_noise_levels = vec![output.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_output = prepare_cuda_radix_ffi( + &output.0.ciphertext, + &mut output_degrees, + &mut output_noise_levels, + ); + + let mut value_degrees: Vec = value.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut value_noise_levels: Vec = + value.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_value = prepare_cuda_radix_ffi(value, &mut value_degrees, &mut value_noise_levels); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec = inputs + .iter() 
+ .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_contains_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_contains_64( + streams.ffi(), + &raw mut ffi_output, + ffi_inputs.as_ptr(), + &raw const ffi_value, + num_inputs, + num_blocks, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_contains_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(&mut output.0.ciphertext, &ffi_output); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_contains_clear< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, + Clear: DecomposableInto, +>( + streams: &CudaStreams, + output: &mut CudaBooleanBlock, + inputs: &[C], + clear: Clear, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + if !inputs.is_empty() { + assert_eq!( + streams.gpu_indexes[0], + inputs[0].as_ref().d_blocks.0.d_vec.gpu_index(0) + ); + } + + let num_inputs = inputs.len() as u32; + let num_blocks = inputs[0].as_ref().d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_clear_blocks: Vec = BlockDecomposer::new(clear, num_bits_in_message) + .take(num_blocks as usize) + .map(|block_value| block_value.cast_into()) + .collect(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut output_degrees = vec![output.0.ciphertext.info.blocks[0].degree.get()]; + let mut output_noise_levels = vec![output.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_output = prepare_cuda_radix_ffi( + &output.0.ciphertext, + &mut output_degrees, + &mut output_noise_levels, + ); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec 
= inputs + .iter() + .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_contains_clear_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_contains_clear_64( + streams.ffi(), + &raw mut ffi_output, + ffi_inputs.as_ptr(), + h_clear_blocks.as_ptr(), + num_inputs, + num_blocks, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_contains_clear_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(&mut output.0.ciphertext, &ffi_output); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_is_in_clears< + T: UnsignedInteger, + B: Numeric, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, +>( + streams: &CudaStreams, + output: &mut CudaBooleanBlock, + input: &CudaRadixCiphertext, + clears: &[Clear], + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], input.d_blocks.0.d_vec.gpu_index(0)); + + let num_clears = clears.len() as u32; + let num_blocks = input.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_decomposed_cleartexts: Vec = clears + .par_iter() + .flat_map(|input_value| { + BlockDecomposer::new(*input_value, num_bits_in_message) + .take(num_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect::>(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut output_degrees = vec![output.0.ciphertext.info.blocks[0].degree.get()]; + let mut output_noise_levels = vec![output.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_output = prepare_cuda_radix_ffi( + &output.0.ciphertext, + &mut output_degrees, + &mut output_noise_levels, + ); + + let mut input_degrees: Vec = input.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut 
input_noise_levels: Vec = + input.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_input = prepare_cuda_radix_ffi(input, &mut input_degrees, &mut input_noise_levels); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_is_in_clears_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_clears, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_is_in_clears_64( + streams.ffi(), + &raw mut ffi_output, + &raw const ffi_input, + h_decomposed_cleartexts.as_ptr(), + num_clears, + num_blocks, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_is_in_clears_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(&mut output.0.ciphertext, &ffi_output); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_compute_final_index_from_selectors< + T: UnsignedInteger, + B: Numeric, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + selectors: &[CudaBooleanBlock], + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + + let num_inputs = selectors.len() as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut ffi_selectors_degrees: Vec> = Vec::with_capacity(selectors.len()); + let mut ffi_selectors_noise_levels: Vec> = Vec::with_capacity(selectors.len()); + let ffi_selectors: Vec = selectors + .iter() 
+ .map(|ct| { + let degrees = vec![ct.0.ciphertext.info.blocks[0].degree.get()]; + let noise_levels = vec![ct.0.ciphertext.info.blocks[0].noise_level.0]; + ffi_selectors_degrees.push(degrees); + ffi_selectors_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + &ct.0.ciphertext, + ffi_selectors_degrees.last_mut().unwrap(), + ffi_selectors_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_compute_final_index_from_selectors_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_compute_final_index_from_selectors_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + ffi_selectors.as_ptr(), + num_inputs, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_compute_final_index_from_selectors_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + ); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_index_in_clears< + T: UnsignedInteger, + B: Numeric, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + input: &CudaRadixCiphertext, + clears: &[Clear], + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], input.d_blocks.0.d_vec.gpu_index(0)); + + let num_clears = clears.len() as u32; + let num_blocks = input.d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_decomposed_cleartexts: Vec = clears + .par_iter() + .flat_map(|input_value| { + BlockDecomposer::new(*input_value, num_bits_in_message) + .take(num_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect::>(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut 
index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut input_degrees: Vec = input.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut input_noise_levels: Vec = + input.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_input = prepare_cuda_radix_ffi(input, &mut input_degrees, &mut input_noise_levels); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_index_in_clears_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_clears, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_index_in_clears_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + &raw const ffi_input, + h_decomposed_cleartexts.as_ptr(), + num_clears, + num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_index_in_clears_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. 
+pub(crate) unsafe fn cuda_backend_unchecked_first_index_in_clears< + T: UnsignedInteger, + B: Numeric, + Clear: UnsignedInteger + DecomposableInto + CastInto + Hash + Sync + Send, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + input: &CudaRadixCiphertext, + clears: &[Clear], + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], input.d_blocks.0.d_vec.gpu_index(0)); + + let num_bits_in_message = message_modulus.0.ilog2(); + let num_blocks = input.d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + + let unique_elements: Vec<(usize, &Clear)> = clears + .iter() + .enumerate() + .unique_by(|&(_, value)| value) + .collect(); + + let num_unique = unique_elements.len() as u32; + + let h_unique_values: Vec = unique_elements + .par_iter() + .flat_map(|(_, input_value)| { + BlockDecomposer::new(**input_value, num_bits_in_message) + .take(num_blocks as usize) + .map(|block_value: Clear| block_value.cast_into()) + .collect::>() + }) + .collect(); + + let num_packed_blocks = (num_blocks_index as usize).div_ceil(2); + let bits_per_packed_block = 2 * num_bits_in_message; + + let h_unique_indices: Vec = unique_elements + .par_iter() + .flat_map(|(index, _)| { + let val = *index 
as u64; + (0..num_packed_blocks).into_par_iter().map(move |b| { + let shift = b as u32 * bits_per_packed_block; + if shift >= 64 { + 0 + } else { + (val >> shift) & ((1 << bits_per_packed_block) - 1) + } + }) + }) + .collect(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut input_degrees: Vec = input.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut input_noise_levels: Vec = + input.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_input = prepare_cuda_radix_ffi(input, &mut input_degrees, &mut input_noise_levels); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_first_index_in_clears_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_unique, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_first_index_in_clears_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + &raw const ffi_input, + h_unique_values.as_ptr(), + h_unique_indices.as_ptr(), + num_unique, + 
num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_first_index_in_clears_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. +pub(crate) unsafe fn cuda_backend_unchecked_first_index_of_clear< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + inputs: &[C], + clear: Clear, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + + let num_inputs = inputs.len() as u32; + let num_blocks = inputs[0].as_ref().d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + let num_bits_in_message = message_modulus.0.ilog2(); + + let h_clear_blocks: Vec = BlockDecomposer::new(clear, num_bits_in_message) + .take(num_blocks as usize) + 
.map(|block_value| block_value.cast_into()) + .collect(); + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec = inputs + .iter() + .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_first_index_of_clear_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + 
cuda_unchecked_first_index_of_clear_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + ffi_inputs.as_ptr(), + h_clear_blocks.as_ptr(), + num_inputs, + num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_first_index_of_clear_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. +pub(crate) unsafe fn cuda_backend_unchecked_first_index_of< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + inputs: &[C], + value: &CudaRadixCiphertext, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], value.d_blocks.0.d_vec.gpu_index(0)); + + let num_inputs = inputs.len() as u32; + let num_blocks = value.d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + 
+ let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut value_degrees: Vec = value.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut value_noise_levels: Vec = + value.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_value = prepare_cuda_radix_ffi(value, &mut value_degrees, &mut value_noise_levels); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec = inputs + .iter() + .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_first_index_of_64( + streams.ffi(), + std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + 
pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_first_index_of_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + ffi_inputs.as_ptr(), + &raw const ffi_value, + num_inputs, + num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_first_index_of_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. +pub(crate) unsafe fn cuda_backend_unchecked_index_of< + T: UnsignedInteger, + B: Numeric, + C: CudaIntegerRadixCiphertext, +>( + streams: &CudaStreams, + index_ct: &mut CudaRadixCiphertext, + match_ct: &mut CudaBooleanBlock, + inputs: &[C], + value: &CudaRadixCiphertext, + bootstrapping_key: &CudaVec, + keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + big_lwe_dimension: LweDimension, + small_lwe_dimension: LweDimension, + ks_level: DecompositionLevelCount, + ks_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + pbs_base_log: DecompositionBaseLog, + pbs_type: PBSType, + grouping_factor: LweBskGroupingFactor, + ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, +) { + assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], keyswitch_key.gpu_index(0)); + assert_eq!(streams.gpu_indexes[0], 
value.d_blocks.0.d_vec.gpu_index(0)); + + let num_inputs = inputs.len() as u32; + let num_blocks = value.d_blocks.lwe_ciphertext_count().0 as u32; + let num_blocks_index = index_ct.d_blocks.lwe_ciphertext_count().0 as u32; + + let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration); + + let mut index_degrees = index_ct + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let mut index_noise_levels = index_ct + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + let mut ffi_index = + prepare_cuda_radix_ffi(index_ct, &mut index_degrees, &mut index_noise_levels); + + let mut match_degrees = vec![match_ct.0.ciphertext.info.blocks[0].degree.get()]; + let mut match_noise_levels = vec![match_ct.0.ciphertext.info.blocks[0].noise_level.0]; + let mut ffi_match = prepare_cuda_radix_ffi( + &match_ct.0.ciphertext, + &mut match_degrees, + &mut match_noise_levels, + ); + + let mut value_degrees: Vec = value.info.blocks.iter().map(|b| b.degree.get()).collect(); + let mut value_noise_levels: Vec = + value.info.blocks.iter().map(|b| b.noise_level.0).collect(); + let ffi_value = prepare_cuda_radix_ffi(value, &mut value_degrees, &mut value_noise_levels); + + let mut ffi_inputs_degrees: Vec> = Vec::with_capacity(inputs.len()); + let mut ffi_inputs_noise_levels: Vec> = Vec::with_capacity(inputs.len()); + let ffi_inputs: Vec = inputs + .iter() + .map(|ct| { + let degrees = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.degree.get()) + .collect(); + let noise_levels = ct + .as_ref() + .info + .blocks + .iter() + .map(|b| b.noise_level.0) + .collect(); + ffi_inputs_degrees.push(degrees); + ffi_inputs_noise_levels.push(noise_levels); + + prepare_cuda_radix_ffi( + ct.as_ref(), + ffi_inputs_degrees.last_mut().unwrap(), + ffi_inputs_noise_levels.last_mut().unwrap(), + ) + }) + .collect(); + + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + + scratch_cuda_unchecked_index_of_64( + streams.ffi(), + 
std::ptr::addr_of_mut!(mem_ptr), + glwe_dimension.0 as u32, + polynomial_size.0 as u32, + big_lwe_dimension.0 as u32, + small_lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + grouping_factor.0 as u32, + num_inputs, + num_blocks, + num_blocks_index, + message_modulus.0 as u32, + carry_modulus.0 as u32, + pbs_type as u32, + true, + noise_reduction_type as u32, + ); + + cuda_unchecked_index_of_64( + streams.ffi(), + &raw mut ffi_index, + &raw mut ffi_match, + ffi_inputs.as_ptr(), + &raw const ffi_value, + num_inputs, + num_blocks, + num_blocks_index, + mem_ptr, + bootstrapping_key.ptr.as_ptr(), + keyswitch_key.ptr.as_ptr(), + ); + + cleanup_cuda_unchecked_index_of_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr)); + + update_noise_degree(index_ct, &ffi_index); + update_noise_degree(&mut match_ct.0.ciphertext, &ffi_match); +} diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index e76ae5b1b..e0f7bcef3 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -18,7 +18,7 @@ use crate::integer::gpu::server_key::CudaBootstrappingKey; use crate::integer::gpu::{ cuda_backend_apply_bivariate_lut, cuda_backend_apply_many_univariate_lut, cuda_backend_apply_univariate_lut, cuda_backend_cast_to_unsigned, - cuda_backend_compute_prefix_sum_hillis_steele, cuda_backend_extend_radix_with_sign_msb, + cuda_backend_extend_radix_with_sign_msb, cuda_backend_extend_radix_with_trivial_zero_blocks_msb, cuda_backend_full_propagate_assign, cuda_backend_noise_squashing, cuda_backend_propagate_single_carry_assign, cuda_backend_trim_radix_blocks_lsb, cuda_backend_trim_radix_blocks_msb, CudaServerKey, PBSType, @@ -1094,134 +1094,6 @@ impl CudaServerKey { ciphertexts } - /// Applies the lookup table on the range of ciphertexts - /// - /// The output must have exactly block_range.len() blocks - pub(crate) fn 
compute_prefix_sum_hillis_steele( - &self, - output: &mut CudaRadixCiphertext, - generates_or_propagates: &mut CudaRadixCiphertext, - lut: &BivariateLookupTableOwned, - block_range: std::ops::Range, - streams: &CudaStreams, - ) { - if block_range.is_empty() { - return; - } - assert_eq!( - generates_or_propagates.d_blocks.lwe_dimension(), - output.d_blocks.lwe_dimension() - ); - - let lwe_dimension = generates_or_propagates.d_blocks.lwe_dimension(); - let lwe_size = lwe_dimension.to_lwe_size().0; - let num_blocks = block_range.len(); - - let mut generates_or_propagates_slice = generates_or_propagates - .d_blocks - .0 - .d_vec - .as_mut_slice(lwe_size * block_range.start..lwe_size * block_range.end, 0) - .unwrap(); - let mut generates_or_propagates_degrees = vec![0; num_blocks]; - let mut generates_or_propagates_noise_levels = vec![0; num_blocks]; - for (i, block_index) in (block_range.clone()).enumerate() { - generates_or_propagates_degrees[i] = - generates_or_propagates.info.blocks[block_index].degree.0; - generates_or_propagates_noise_levels[i] = generates_or_propagates.info.blocks - [block_index] - .noise_level - .0; - } - let mut output_slice = output - .d_blocks - .0 - .d_vec - .as_mut_slice(lwe_size * block_range.start..lwe_size * block_range.end, 0) - .unwrap(); - let mut output_degrees = vec![0_u64; num_blocks]; - let mut output_noise_levels = vec![0_u64; num_blocks]; - unsafe { - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - cuda_backend_compute_prefix_sum_hillis_steele( - streams, - &mut output_slice, - &mut output_degrees, - &mut output_noise_levels, - &mut generates_or_propagates_slice, - &mut generates_or_propagates_degrees, - &mut generates_or_propagates_noise_levels, - lut.acc.acc.as_ref(), - lut.acc.degree.0, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - 
self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - num_blocks as u32, - self.message_modulus, - self.carry_modulus, - PBSType::Classical, - LweBskGroupingFactor(0), - d_bsk.ms_noise_reduction_configuration.as_ref(), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - cuda_backend_compute_prefix_sum_hillis_steele( - streams, - &mut output_slice, - &mut output_degrees, - &mut output_noise_levels, - &mut generates_or_propagates_slice, - &mut generates_or_propagates_degrees, - &mut generates_or_propagates_noise_levels, - lut.acc.acc.as_ref(), - lut.acc.degree.0, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - num_blocks as u32, - self.message_modulus, - self.carry_modulus, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - None, - ); - } - } - } - - for (i, info) in output.info.blocks[block_range.start..block_range.end] - .iter_mut() - .enumerate() - { - info.degree = Degree(output_degrees[i]); - info.noise_level = NoiseLevel(output_noise_levels[i]); - } - for (i, info) in generates_or_propagates.info.blocks[block_range.start..block_range.end] - .iter_mut() - .enumerate() - { - info.degree = Degree(generates_or_propagates_degrees[i]); - info.noise_level = NoiseLevel(generates_or_propagates_noise_levels[i]); - } - } - pub(crate) fn extend_radix_with_sign_msb( &self, ct: &T, diff --git a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs index 9c45e5944..26a2b0546 100644 --- a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs +++ 
b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs @@ -1,98 +1,24 @@ -use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::{LweBskGroupingFactor, UnsignedInteger}; -use crate::integer::block_decomposition::{BlockDecomposer, Decomposable, DecomposableInto}; +use crate::integer::block_decomposition::DecomposableInto; use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; -use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext}; -use crate::integer::gpu::server_key::radix::CudaRadixCiphertext; use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey}; use crate::integer::gpu::{ - cuda_backend_aggregate_one_hot_vector, cuda_backend_compute_equality_selectors, - cuda_backend_create_possible_results, cuda_backend_get_unchecked_match_value_or_size_on_gpu, - cuda_backend_get_unchecked_match_value_size_on_gpu, cuda_backend_unchecked_match_value, + cuda_backend_compute_final_index_from_selectors, + cuda_backend_get_unchecked_match_value_or_size_on_gpu, + cuda_backend_get_unchecked_match_value_size_on_gpu, cuda_backend_unchecked_contains, + cuda_backend_unchecked_contains_clear, cuda_backend_unchecked_first_index_in_clears, + cuda_backend_unchecked_first_index_of, cuda_backend_unchecked_first_index_of_clear, + cuda_backend_unchecked_index_in_clears, cuda_backend_unchecked_index_of, + cuda_backend_unchecked_is_in_clears, cuda_backend_unchecked_match_value, cuda_backend_unchecked_match_value_or, PBSType, }; pub use crate::integer::server_key::radix_parallel::MatchValues; use crate::prelude::CastInto; -use itertools::Itertools; -use rayon::prelude::*; use std::hash::Hash; impl CudaServerKey { - #[allow(clippy::unused_self)] - pub(crate) fn convert_selectors_to_unsigned_radix_ciphertext( - &self, - selectors: &[CudaBooleanBlock], 
- streams: &CudaStreams, - ) -> CudaUnsignedRadixCiphertext { - if selectors.is_empty() { - return self.create_trivial_radix(0, 1, streams); - } - let packed_list = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list( - selectors - .iter() - .map(|ciphertext| &ciphertext.0.ciphertext.d_blocks), - streams, - ); - let vec_block_info: Vec = selectors - .iter() - .flat_map(|ct| ct.0.ciphertext.info.blocks.clone()) - .collect(); - let radix_info = CudaRadixCiphertextInfo { - blocks: vec_block_info, - }; - CudaIntegerRadixCiphertext::from(CudaRadixCiphertext { - d_blocks: packed_list, - info: radix_info, - }) - } - - pub(crate) fn convert_unsigned_radix_ciphertext_to_selectors( - &self, - ct: &mut CudaUnsignedRadixCiphertext, - streams: &CudaStreams, - ) -> Vec { - let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0; - let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size().0; - let mut unpacked_selectors = Vec::::with_capacity(num_blocks); - for i in 0..num_blocks { - let mut radix_ct: CudaUnsignedRadixCiphertext = - self.create_trivial_radix(0, 1, streams); - let slice_in = ct - .as_mut() - .d_blocks - .0 - .d_vec - .as_mut_slice(i * lwe_size..(i + 1) * lwe_size, 0) - .unwrap(); - let mut slice_out = radix_ct - .as_mut() - .d_blocks - .0 - .d_vec - .as_mut_slice(0..lwe_size, 0) - .unwrap(); - unsafe { - slice_out.copy_from_gpu_async(&slice_in, streams, 0); - streams.synchronize(); - } - let boolean_block = CudaBooleanBlock::from_cuda_radix_ciphertext(radix_ct.into_inner()); - - unpacked_selectors.push(boolean_block); - } - unpacked_selectors - } - - /// `match` an input value to an output value - /// - /// - Input values are not required to span all possible values that `ct` could hold. 
- /// - /// - The output radix has a number of blocks that depends on the maximum possible output value - /// from the `MatchValues` - /// - /// Returns a boolean block that encrypts `true` if the input `ct` - /// matched one of the possible inputs pub fn unchecked_match_value( &self, ct: &CudaUnsignedRadixCiphertext, @@ -110,21 +36,6 @@ impl CudaServerKey { return (trivial_ct, trivial_bool); } - let num_input_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - let num_bits_in_message = self.message_modulus.0.ilog2(); - - let h_match_inputs: Vec = matches - .get_values() - .par_iter() - .map(|(input, _output)| *input) - .flat_map(|input_value| { - BlockDecomposer::new(input_value, num_bits_in_message) - .take(num_input_blocks as usize) - .map(|block_value| block_value.cast_into()) - .collect::>() - }) - .collect::>(); - let max_output_value = matches .get_values() .iter() @@ -135,19 +46,6 @@ impl CudaServerKey { let num_output_unpacked_blocks = self.num_blocks_to_represent_unsigned_value(max_output_value); - let num_output_packed_blocks = num_output_unpacked_blocks.div_ceil(2) as u32; - - let h_match_outputs: Vec = matches - .get_values() - .par_iter() - .map(|(_input, output)| *output) - .flat_map(|output_value| { - BlockDecomposer::new(output_value, 2 * num_bits_in_message) - .take(num_output_packed_blocks as usize) - .map(|block_value| block_value.cast_into()) - .collect::>() - }) - .collect::>(); let mut result_ct: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(num_output_unpacked_blocks, streams); @@ -155,9 +53,6 @@ impl CudaServerKey { self.create_trivial_zero_radix::(1, streams), ); - let max_output_is_zero = max_output_value == Clear::ZERO; - let num_matches = matches.get_values().len() as u32; - unsafe { match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { @@ -166,12 +61,7 @@ impl CudaServerKey { &mut result_ct, &mut result_bool, ct.as_ref(), - &h_match_inputs, - &h_match_outputs, - num_matches, - 
num_input_blocks, - num_output_packed_blocks, - max_output_is_zero, + matches, self.message_modulus, self.carry_modulus, &d_bsk.d_vec, @@ -199,12 +89,7 @@ impl CudaServerKey { &mut result_ct, &mut result_bool, ct.as_ref(), - &h_match_inputs, - &h_match_outputs, - num_matches, - num_input_blocks, - num_output_packed_blocks, - max_output_is_zero, + matches, self.message_modulus, self.carry_modulus, &d_multibit_bsk.d_vec, @@ -239,33 +124,19 @@ impl CudaServerKey { streams: &CudaStreams, ) -> u64 where - Clear: UnsignedInteger + DecomposableInto + CastInto, + Clear: + UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, { if matches.get_values().is_empty() { return 0; } - let num_input_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - - let max_output_value = matches - .get_values() - .iter() - .copied() - .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) - .expect("luts is not empty at this point") - .1; - - let num_output_unpacked_blocks = - self.num_blocks_to_represent_unsigned_value(max_output_value); - let num_output_packed_blocks = num_output_unpacked_blocks.div_ceil(2) as u32; - - let max_output_is_zero = max_output_value == Clear::ZERO; - let num_matches = matches.get_values().len() as u32; - match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { cuda_backend_get_unchecked_match_value_size_on_gpu( streams, + ct.as_ref(), + matches, d_bsk.glwe_dimension, d_bsk.polynomial_size, self.key_switching_key @@ -282,16 +153,14 @@ impl CudaServerKey { self.message_modulus, self.carry_modulus, PBSType::Classical, - num_matches, - num_input_blocks, - num_output_packed_blocks, - max_output_is_zero, d_bsk.ms_noise_reduction_configuration.as_ref(), ) } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { cuda_backend_get_unchecked_match_value_size_on_gpu( streams, + ct.as_ref(), + matches, d_multibit_bsk.glwe_dimension, d_multibit_bsk.polynomial_size, self.key_switching_key @@ -308,10 +177,6 @@ impl CudaServerKey { 
self.message_modulus, self.carry_modulus, PBSType::MultiBit, - num_matches, - num_input_blocks, - num_output_packed_blocks, - max_output_is_zero, None, ) } @@ -408,7 +273,7 @@ impl CudaServerKey { streams: &CudaStreams, ) -> CudaUnsignedRadixCiphertext where - Clear: UnsignedInteger + DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + CastInto, { if matches.get_values().is_empty() { let num_blocks = self.num_blocks_to_represent_unsigned_value(or_value); @@ -417,21 +282,6 @@ impl CudaServerKey { return ct; } - let num_input_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - let num_bits_in_message = self.message_modulus.0.ilog2(); - - let h_match_inputs: Vec = matches - .get_values() - .par_iter() - .map(|(input, _output)| *input) - .flat_map(|input_value| { - BlockDecomposer::new(input_value, num_bits_in_message) - .take(num_input_blocks as usize) - .map(|block_value| block_value.cast_into()) - .collect::>() - }) - .collect::>(); - let max_output_value_match = matches .get_values() .iter() @@ -444,31 +294,9 @@ impl CudaServerKey { let num_blocks_or = self.num_blocks_to_represent_unsigned_value(or_value); let final_num_blocks = num_blocks_match.max(num_blocks_or); - let num_match_packed_blocks = num_blocks_match.div_ceil(2) as u32; - - let h_match_outputs: Vec = matches - .get_values() - .par_iter() - .map(|(_input, output)| *output) - .flat_map(|output_value| { - BlockDecomposer::new(output_value, 2 * num_bits_in_message) - .take(num_match_packed_blocks as usize) - .map(|block_value| block_value.cast_into()) - .collect::>() - }) - .collect::>(); - - let h_or_value: Vec = BlockDecomposer::new(or_value, num_bits_in_message) - .take(final_num_blocks) - .map(|block_value| block_value.cast_into()) - .collect(); - let mut result: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(final_num_blocks, streams); - let max_output_is_zero = max_output_value_match == Clear::ZERO; - let num_matches = 
matches.get_values().len() as u32; - unsafe { match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { @@ -476,14 +304,8 @@ impl CudaServerKey { streams, &mut result, ct.as_ref(), - &h_match_inputs, - &h_match_outputs, - &h_or_value, - num_matches, - num_input_blocks, - num_match_packed_blocks, - final_num_blocks as u32, - max_output_is_zero, + matches, + or_value, self.message_modulus, self.carry_modulus, &d_bsk.d_vec, @@ -510,14 +332,8 @@ impl CudaServerKey { streams, &mut result, ct.as_ref(), - &h_match_inputs, - &h_match_outputs, - &h_or_value, - num_matches, - num_input_blocks, - num_match_packed_blocks, - final_num_blocks as u32, - max_output_is_zero, + matches, + or_value, self.message_modulus, self.carry_modulus, &d_multibit_bsk.d_vec, @@ -553,35 +369,20 @@ impl CudaServerKey { streams: &CudaStreams, ) -> u64 where - Clear: UnsignedInteger + DecomposableInto + CastInto, + Clear: + UnsignedInteger + DecomposableInto + CastInto + CastInto + Sync + Send, { if matches.get_values().is_empty() { return 0; } - let num_input_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - - let max_output_value_match = matches - .get_values() - .iter() - .copied() - .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr)) - .expect("luts is not empty at this point") - .1; - - let num_blocks_match = self.num_blocks_to_represent_unsigned_value(max_output_value_match); - let num_blocks_or = self.num_blocks_to_represent_unsigned_value(or_value); - let final_num_blocks = num_blocks_match.max(num_blocks_or); - - let num_match_packed_blocks = num_blocks_match.div_ceil(2) as u32; - - let max_output_is_zero = max_output_value_match == Clear::ZERO; - let num_matches = matches.get_values().len() as u32; - match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { cuda_backend_get_unchecked_match_value_or_size_on_gpu( streams, + ct.as_ref(), + matches, + or_value, d_bsk.glwe_dimension, d_bsk.polynomial_size, self.key_switching_key @@ -598,17 
+399,15 @@ impl CudaServerKey { self.message_modulus, self.carry_modulus, PBSType::Classical, - num_matches, - num_input_blocks, - num_match_packed_blocks, - final_num_blocks as u32, - max_output_is_zero, d_bsk.ms_noise_reduction_configuration.as_ref(), ) } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { cuda_backend_get_unchecked_match_value_or_size_on_gpu( streams, + ct.as_ref(), + matches, + or_value, d_multibit_bsk.glwe_dimension, d_multibit_bsk.polynomial_size, self.key_switching_key @@ -625,11 +424,6 @@ impl CudaServerKey { self.message_modulus, self.carry_modulus, PBSType::MultiBit, - num_matches, - num_input_blocks, - num_match_packed_blocks, - final_num_blocks as u32, - max_output_is_zero, None, ) } @@ -721,15 +515,71 @@ impl CudaServerKey { let d_ct: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams); return CudaBooleanBlock::from_cuda_radix_ciphertext(d_ct.ciphertext); } - //Here It would be better to launch them in parallel maybe using different streams or - // packed them in a vector - let selectors = cts - .iter() - .map(|ct| self.eq(ct, value, streams)) - .collect::>(); - let packed_ct = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - self.unchecked_is_at_least_one_comparisons_block_true(&packed_ct, streams) + let mut result = CudaBooleanBlock::from_cuda_radix_ciphertext( + self.create_trivial_zero_radix::(1, streams) + .into_inner(), + ); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_contains( + streams, + &mut result, + cts, + value.as_ref(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), 
+ d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_contains( + streams, + &mut result, + cts, + value.as_ref(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + result } /// Returns an encrypted `true` if the encrypted `value` is found in the encrypted slice @@ -821,13 +671,71 @@ impl CudaServerKey { ); return trivial_bool; } - let selectors = cts - .iter() - .map(|ct| self.scalar_eq(ct, clear, streams)) - .collect::>(); - let packed_ct = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - self.unchecked_is_at_least_one_comparisons_block_true(&packed_ct, streams) + let mut result = CudaBooleanBlock::from_cuda_radix_ciphertext( + self.create_trivial_zero_radix::(1, streams) + .into_inner(), + ); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_contains_clear( + streams, + &mut result, + cts, + clear, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + 
self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_contains_clear( + streams, + &mut result, + cts, + clear, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + result } /// Returns an encrypted `true` if the clear `value` is found in the encrypted slice @@ -903,7 +811,7 @@ impl CudaServerKey { ) -> CudaBooleanBlock where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { if clears.is_empty() { let trivial_ct: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams); @@ -912,10 +820,69 @@ impl CudaServerKey { ); return trivial_bool; } - let selectors = self.compute_equality_selectors(ct, clears.par_iter().copied(), streams); - let blocks_ct = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - self.unchecked_is_at_least_one_comparisons_block_true(&blocks_ct, streams) + let ct_res: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams); + let mut boolean_res = CudaBooleanBlock::from_cuda_radix_ciphertext(ct_res.into_inner()); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_is_in_clears( + streams, + &mut 
boolean_res, + ct.as_ref(), + clears, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_is_in_clears( + streams, + &mut boolean_res, + ct.as_ref(), + clears, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + boolean_res } /// Returns an encrypted `true` if the encrypted `value` is found in the clear slice @@ -963,7 +930,7 @@ impl CudaServerKey { ) -> CudaBooleanBlock where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { let mut tmp_ct; let ct = if ct.block_carries_are_empty() { @@ -983,7 +950,6 @@ impl CudaServerKey { /// /// - clear values in the slice must be unique (otherwise use /// [Self::unchecked_first_index_in_clears]) - /// - If the encrypted value is not in the clear slice, the returned index is 0 pub fn unchecked_index_in_clears( &self, ct: &T, @@ -992,7 +958,7 @@ impl 
CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { if clears.is_empty() { let trivial_ct2: CudaUnsignedRadixCiphertext = self.create_trivial_radix( @@ -1006,8 +972,80 @@ impl CudaServerKey { ); return (trivial_ct2, trivial_bool); } - let selectors = self.compute_equality_selectors(ct, clears.par_iter().copied(), streams); - self.compute_final_index_from_selectors(selectors, streams) + + let num_clears = clears.len(); + let num_blocks_index = + (num_clears.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_index, streams); + + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_index_in_clears( + streams, + index_ct.as_mut(), + &mut match_ct, + ct.as_ref(), + clears, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_index_in_clears( + streams, + index_ct.as_mut(), + &mut match_ct, + ct.as_ref(), + clears, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + 
self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + + (index_ct, match_ct) } /// Returns the encrypted index of the encrypted `value` in the clear slice @@ -1063,7 +1101,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { let mut tmp_ct; let ct = if ct.block_carries_are_empty() { @@ -1093,7 +1131,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto + Hash, + Clear: UnsignedInteger + DecomposableInto + CastInto + Hash + Sync + Send, { if clears.is_empty() { let trivial_ct2: CudaUnsignedRadixCiphertext = self.create_trivial_radix( @@ -1107,34 +1145,79 @@ impl CudaServerKey { ); return (trivial_ct2, trivial_bool); } - let unique_clears = clears - .iter() - .copied() - .enumerate() - .unique_by(|&(_, value)| value) - .collect::>(); - let selectors = self.compute_equality_selectors( - ct, - unique_clears.par_iter().copied().map(|(_, value)| value), - streams, - ); - let selectors2 = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); let num_blocks_result = (clears.len().ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; - let possible_values = self.create_possible_results( - num_blocks_result, - selectors - .into_par_iter() - .zip(unique_clears.into_par_iter().map(|(index, _)| index as u64)), - streams, - ); + let mut index_ct: 
CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_result, streams); - let out_ct = self.aggregate_one_hot_vector(&possible_values, streams); + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); - let block = self.unchecked_is_at_least_one_comparisons_block_true(&selectors2, streams); - (out_ct, block) + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_first_index_in_clears( + streams, + index_ct.as_mut(), + &mut match_ct, + ct.as_ref(), + clears, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_first_index_in_clears( + streams, + index_ct.as_mut(), + &mut match_ct, + ct.as_ref(), + clears, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + + (index_ct, match_ct) } /// Returns the 
encrypted index of the _first_ occurrence of encrypted `value` in the clear @@ -1190,7 +1273,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto + Hash, + Clear: UnsignedInteger + DecomposableInto + CastInto + Hash + Sync + Send, { let mut tmp_ct; let ct = if ct.block_carries_are_empty() { @@ -1205,13 +1288,6 @@ impl CudaServerKey { self.unchecked_first_index_in_clears(ct, clears, streams) } - /// Returns the encrypted index of the of encrypted `value` in the ciphertext slice - /// also, it returns an encrypted boolean that is `true` if the encrypted value was found. - /// - /// # Notes - /// - /// - clear values in the slice must be unique (otherwise use [Self::unchecked_first_index_of]) - /// - If the encrypted value is not in the encrypted slice, the returned index is 0 pub fn unchecked_index_of( &self, cts: &[T], @@ -1228,12 +1304,79 @@ impl CudaServerKey { ); return (trivial_ct, trivial_bool); } - let selectors = cts - .iter() - .map(|ct| self.eq(ct, value, streams)) - .collect::>(); - self.compute_final_index_from_selectors(selectors, streams) + let num_inputs = cts.len(); + let num_blocks_index = + (num_inputs.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_index, streams); + + let trivial_bool: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); + + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_index_of( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + value.as_ref(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + 
.to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_index_of( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + value.as_ref(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } + + (index_ct, match_ct) } /// Returns the encrypted index of the of encrypted `value` in the ciphertext slice @@ -1352,7 +1495,7 @@ impl CudaServerKey { .map(|ct| self.scalar_eq(ct, clear, streams)) .collect::>(); - self.compute_final_index_from_selectors(selectors, streams) + self.compute_final_index_from_selectors(&selectors, streams) } /// Returns the encrypted index of the of clear `value` in the ciphertext slice @@ -1446,7 +1589,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { if cts.is_empty() { let trivial_ct: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams); @@ -1455,34 +1598,80 @@ impl CudaServerKey { ); return (trivial_ct, trivial_bool); } + + let num_inputs = cts.len(); let 
num_blocks_result = - (cts.len().ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + (num_inputs.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; - let selectors = cts - .iter() - .map(|ct| self.scalar_eq(ct, clear, streams)) - .collect::>(); + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_result, streams); - let packed_selectors = - self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - let mut only_first_selectors = self.only_keep_first_true(packed_selectors, streams); + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); - let unpacked_selectors = - self.convert_unsigned_radix_ciphertext_to_selectors(&mut only_first_selectors, streams); + unsafe { + match &self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_first_index_of_clear( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + clear, + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_first_index_of_clear( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + clear, + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + 
.to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } - let possible_values = self.create_possible_results( - num_blocks_result, - unpacked_selectors - .into_par_iter() - .enumerate() - .map(|(i, v)| (v, i as u64)), - streams, - ); - let out_ct = self.aggregate_one_hot_vector(&possible_values, streams); - - let block = - self.unchecked_is_at_least_one_comparisons_block_true(&only_first_selectors, streams); - (out_ct, block) + (index_ct, match_ct) } /// Returns the encrypted index of the _first_ occurrence of clear `value` in the ciphertext @@ -1539,7 +1728,7 @@ impl CudaServerKey { ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) where T: CudaIntegerRadixCiphertext, - Clear: DecomposableInto + CastInto, + Clear: UnsignedInteger + DecomposableInto + CastInto + Sync + Send, { let mut tmp_cts = Vec::::with_capacity(cts.len()); @@ -1584,35 +1773,79 @@ impl CudaServerKey { return (trivial_ct, trivial_bool); } + let num_inputs = cts.len(); let num_blocks_result = - (cts.len().ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + (num_inputs.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; - let selectors = cts - .iter() - .map(|ct| self.eq(ct, value, streams)) - .collect::>(); + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_result, streams); - let packed_selectors = - self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); - let mut only_first_selectors = self.only_keep_first_true(packed_selectors, streams); + unsafe { + match 
&self.bootstrapping_key { + CudaBootstrappingKey::Classic(d_bsk) => { + cuda_backend_unchecked_first_index_of( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + value.as_ref(), + &d_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_bsk.glwe_dimension, + d_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_bsk.decomp_level_count, + d_bsk.decomp_base_log, + PBSType::Classical, + LweBskGroupingFactor(0), + d_bsk.ms_noise_reduction_configuration.as_ref(), + ); + } + CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { + cuda_backend_unchecked_first_index_of( + streams, + index_ct.as_mut(), + &mut match_ct, + cts, + value.as_ref(), + &d_multibit_bsk.d_vec, + &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, + d_multibit_bsk.glwe_dimension, + d_multibit_bsk.polynomial_size, + self.key_switching_key + .input_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key + .output_key_lwe_size() + .to_lwe_dimension(), + self.key_switching_key.decomposition_level_count(), + self.key_switching_key.decomposition_base_log(), + d_multibit_bsk.decomp_level_count, + d_multibit_bsk.decomp_base_log, + PBSType::MultiBit, + d_multibit_bsk.grouping_factor, + None, + ); + } + } + } - let unpacked_selectors = - self.convert_unsigned_radix_ciphertext_to_selectors(&mut only_first_selectors, streams); - - let possible_values = self.create_possible_results( - num_blocks_result, - unpacked_selectors - .into_par_iter() - .enumerate() - .map(|(i, v)| (v, i as u64)), - streams, - ); - let out_ct = self.aggregate_one_hot_vector(&possible_values, streams); - - let block = - self.unchecked_is_at_least_one_comparisons_block_true(&only_first_selectors, streams); - (out_ct, block) + (index_ct, match_ct) 
} /// Returns the encrypted index of the _first_ occurrence of encrypted `value` in the ciphertext @@ -1702,106 +1935,32 @@ impl CudaServerKey { fn compute_final_index_from_selectors( &self, - selectors: Vec, + selectors: &[CudaBooleanBlock], streams: &CudaStreams, ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock) { - let num_blocks_result = - (selectors.len().ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; + let num_inputs = selectors.len(); + let num_blocks_index = + (num_inputs.ilog2() + 1).div_ceil(self.message_modulus.0.ilog2()) as usize; - let selectors2 = self.convert_selectors_to_unsigned_radix_ciphertext(&selectors, streams); - let possible_values = self.create_possible_results( - num_blocks_result, - selectors - .into_par_iter() - .enumerate() - .map(|(i, v)| (v, i as u64)), - streams, - ); - let one_hot_vector = self.aggregate_one_hot_vector(&possible_values, streams); + let mut index_ct: CudaUnsignedRadixCiphertext = + self.create_trivial_zero_radix(num_blocks_index, streams); - let block = self.unchecked_is_at_least_one_comparisons_block_true(&selectors2, streams); - - (one_hot_vector, block) - } - - /// Computes the vector of selectors from an input iterator of clear values and an encrypted - /// value - /// - /// Given an iterator of clear values, and an encrypted radix ciphertext, - /// this method will return a vector of encrypted boolean values where - /// each value is either 1 if the ct is equal to the corresponding clear in the iterator - /// otherwise it will be 0. - /// On the GPU after applying many luts the result is stored differently than on the CPU. 
- /// If we have 4 many luts result is stored contiguosly in memory as follows: - /// [result many lut 1][result many lut 2][result many lut 3][result many lut 4] - /// In this case we need to jump between the results of the many luts to build the final result - /// - /// Requires ct to have empty carries - fn compute_equality_selectors( - &self, - ct: &T, - possible_input_values: Iter, - streams: &CudaStreams, - ) -> Vec - where - T: CudaIntegerRadixCiphertext, - Iter: ParallelIterator, - Clear: Decomposable + CastInto + Send + Sync, - { - assert!( - ct.block_carries_are_empty(), - "internal error: ciphertext carries must be empty" - ); - assert!( - self.carry_modulus.0 >= self.message_modulus.0, - "This function uses many LUTs in a way that requires to have at least as much carry \ - space as message space ({:?} vs {:?})", - self.carry_modulus, - self.message_modulus - ); - - let num_bits_in_message = self.message_modulus.0.ilog2(); - let num_blocks = ct.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - - let clear_values: Vec = possible_input_values.collect(); - let num_possible_values = clear_values.len() as u32; - - if num_possible_values == 0 { - return vec![]; - } - - let h_decomposed_cleartexts: Vec = clear_values - .into_par_iter() - .flat_map(|input_value| { - BlockDecomposer::new(input_value, num_bits_in_message) - .take(num_blocks as usize) - .map(|block_value| block_value.cast_into() as u64) - .collect::>() - }) - .collect::>(); - - let mut result_vec: Vec = (0..num_possible_values) - .map(|_| { - CudaBooleanBlock( - self.create_trivial_zero_radix::(1, streams), - ) - }) - .collect(); + let trivial_bool = + self.create_trivial_zero_radix::(1, streams); + let mut match_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(trivial_bool.into_inner()); unsafe { match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { - cuda_backend_compute_equality_selectors( + cuda_backend_compute_final_index_from_selectors( streams, - &mut result_vec, - 
ct.as_ref(), - &h_decomposed_cleartexts, - num_possible_values, - num_blocks, - self.message_modulus, - self.carry_modulus, + index_ct.as_mut(), + &mut match_ct, + selectors, &d_bsk.d_vec, &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, d_bsk.glwe_dimension, d_bsk.polynomial_size, self.key_switching_key @@ -1820,17 +1979,15 @@ impl CudaServerKey { ); } CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - cuda_backend_compute_equality_selectors( + cuda_backend_compute_final_index_from_selectors( streams, - &mut result_vec, - ct.as_ref(), - &h_decomposed_cleartexts, - num_possible_values, - num_blocks, - self.message_modulus, - self.carry_modulus, + index_ct.as_mut(), + &mut match_ct, + selectors, &d_multibit_bsk.d_vec, &self.key_switching_key.d_vec, + self.message_modulus, + self.carry_modulus, d_multibit_bsk.glwe_dimension, d_multibit_bsk.polynomial_size, self.key_switching_key @@ -1851,259 +2008,6 @@ impl CudaServerKey { } } - result_vec - } - - /// Creates a vector of radix ciphertext from an iterator that associates encrypted boolean - /// values to clear values. - /// - /// The elements of the resulting vector are zero if the corresponding BooleanBlock encrypted 0, - /// otherwise it encrypts the associated clear value. - /// - /// This is only really useful if only one of the boolean block is known to be non-zero. 
- /// - /// `num_blocks`: number of blocks (unpacked) needed to represent the biggest clear value - /// - /// - Resulting radix ciphertexts have their block packed, thus they will have ceil (numb_blocks - /// / 2) elements - fn create_possible_results( - &self, - num_blocks: usize, - possible_outputs: Iter, - streams: &CudaStreams, - ) -> Vec - where - T: CudaIntegerRadixCiphertext, - Iter: ParallelIterator, - Clear: Decomposable + CastInto + Send + Sync, - { - assert!( - self.carry_modulus.0 >= self.message_modulus.0, - "As this function packs blocks, it requires to have at least as much carry \ - space as message space ({:?} vs {:?})", - self.carry_modulus, - self.message_modulus - ); - - let num_bits_in_message = self.message_modulus.0.ilog2(); - let num_packed_blocks = num_blocks.div_ceil(2) as u32; - - let collected_outputs: Vec<(CudaBooleanBlock, Clear)> = possible_outputs.collect(); - let num_possible_values = collected_outputs.len(); - - if num_possible_values == 0 { - return vec![]; - } - - let (selectors, clear_values): (Vec, Vec) = - collected_outputs.into_iter().unzip(); - - let h_decomposed_cleartexts: Vec = clear_values - .into_par_iter() - .flat_map(|input_value| { - BlockDecomposer::new(input_value, 2 * num_bits_in_message) - .take(num_packed_blocks as usize) - .map(|block_value| block_value.cast_into() as u64) - .collect::>() - }) - .collect::>(); - - let mut result_vec: Vec = (0..num_possible_values) - .map(|_| self.create_trivial_zero_radix(num_packed_blocks as usize, streams)) - .collect(); - - unsafe { - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - cuda_backend_create_possible_results( - streams, - &mut result_vec, - &selectors, - &h_decomposed_cleartexts, - num_packed_blocks, - self.message_modulus, - self.carry_modulus, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - 
self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - PBSType::Classical, - LweBskGroupingFactor(0), - d_bsk.ms_noise_reduction_configuration.as_ref(), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - cuda_backend_create_possible_results( - streams, - &mut result_vec, - &selectors, - &h_decomposed_cleartexts, - num_packed_blocks, - self.message_modulus, - self.carry_modulus, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - None, - ); - } - } - } - - result_vec - } - - /// Aggregate/combines a vec of one-hot vector of radix ciphertexts - /// (i.e. at most one of the vector element is non-zero) into single ciphertext - /// containing the non-zero value. - /// - /// The elements in the one hot vector have their block packed. 
- /// - /// The returned result has non packed blocks - fn aggregate_one_hot_vector(&self, one_hot_vector: &[T], streams: &CudaStreams) -> T - where - T: CudaIntegerRadixCiphertext, - { - if one_hot_vector.is_empty() { - return self.create_trivial_zero_radix(0, streams); - } - - let num_packed_blocks = one_hot_vector[0].as_ref().d_blocks.lwe_ciphertext_count().0; - let mut output_ct: T = self.create_trivial_zero_radix(num_packed_blocks * 2, streams); - - unsafe { - match &self.bootstrapping_key { - CudaBootstrappingKey::Classic(d_bsk) => { - cuda_backend_aggregate_one_hot_vector( - streams, - &mut output_ct, - one_hot_vector, - self.message_modulus, - self.carry_modulus, - &d_bsk.d_vec, - &self.key_switching_key.d_vec, - d_bsk.glwe_dimension, - d_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_bsk.decomp_level_count, - d_bsk.decomp_base_log, - PBSType::Classical, - LweBskGroupingFactor(0), - d_bsk.ms_noise_reduction_configuration.as_ref(), - ); - } - CudaBootstrappingKey::MultiBit(d_multibit_bsk) => { - cuda_backend_aggregate_one_hot_vector( - streams, - &mut output_ct, - one_hot_vector, - self.message_modulus, - self.carry_modulus, - &d_multibit_bsk.d_vec, - &self.key_switching_key.d_vec, - d_multibit_bsk.glwe_dimension, - d_multibit_bsk.polynomial_size, - self.key_switching_key - .input_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key - .output_key_lwe_size() - .to_lwe_dimension(), - self.key_switching_key.decomposition_level_count(), - self.key_switching_key.decomposition_base_log(), - d_multibit_bsk.decomp_level_count, - d_multibit_bsk.decomp_base_log, - PBSType::MultiBit, - d_multibit_bsk.grouping_factor, - None, - ); - } - } - } - - output_ct - } - - /// Only keeps at most one Ciphertext that encrypts 1 - /// - /// Given a 
Vec of Ciphertexts where each Ciphertext encrypts 0 or 1 - /// This function will return a Vec of Ciphertext where at most one encryption of 1 is present - /// The first encryption of one is kept - fn only_keep_first_true(&self, values: T, streams: &CudaStreams) -> T - where - T: CudaIntegerRadixCiphertext, - { - let num_ct_blocks = values.as_ref().d_blocks.lwe_ciphertext_count().0; - if num_ct_blocks <= 1 { - return values; - } - const ALREADY_SEEN: u64 = 2; - let lut_fn = self.generate_lookup_table_bivariate(|current, previous| { - if previous == 1 || previous == ALREADY_SEEN { - ALREADY_SEEN - } else { - current - } - }); - - let mut first_true: T = self.create_trivial_zero_radix(num_ct_blocks, streams); - - let mut clone_ct = values.duplicate(streams); - self.compute_prefix_sum_hillis_steele( - first_true.as_mut(), - clone_ct.as_mut(), - &lut_fn, - 0..num_ct_blocks, - streams, - ); - - let lut = self.generate_lookup_table(|x| { - let x = x % self.message_modulus.0; - if x == ALREADY_SEEN { - 0 - } else { - x - } - }); - - let cloned_ct = first_true.duplicate(streams); - self.apply_lookup_table( - first_true.as_mut(), - cloned_ct.as_ref(), - &lut, - 0..num_ct_blocks, - streams, - ); - first_true + (index_ct, match_ct) } }