fix(gpu): fix match value parallelization

2026-04-28 03:01:21 -04:00 · 2026-04-27 15:56:02 +02:00
8 changed files with 1105 additions and 962 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -753,8 +753,9 @@ uint64_t scratch_cuda_unchecked_match_value_64_async(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_output_packed_blocks, uint32_t max_output_is_zero,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_match_value_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out_result,
@@ -793,9 +794,9 @@ uint64_t scratch_cuda_unchecked_match_value_or_64_async(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
-    uint32_t max_output_is_zero, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t max_output_is_zero, uint32_t match_parallelism,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_match_value_or_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
@@ -812,9 +813,9 @@ uint64_t scratch_cuda_unchecked_contains_64_async(
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_contains_64_async(CudaStreamsFFI streams,
                                      CudaRadixCiphertextFFI *output,
@@ -832,9 +833,9 @@ uint64_t scratch_cuda_unchecked_contains_clear_64_async(
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_contains_clear_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
@@ -850,9 +851,9 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t num_clears, uint32_t num_blocks, uint32_t match_parallelism,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_is_in_clears_64_async(CudaStreamsFFI streams,
                                          CudaRadixCiphertextFFI *output,
@@ -871,8 +872,9 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_index_in_clears_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
@@ -890,8 +892,9 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_first_index_in_clears_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
@@ -913,8 +916,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_first_index_of_clear_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
@@ -932,8 +936,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_first_index_of_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
@@ -951,8 +956,9 @@ uint64_t scratch_cuda_unchecked_index_of_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_index_of_64_async(CudaStreamsFFI streams,
                                      CudaRadixCiphertextFFI *index_ct,
@@ -972,16 +978,16 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_unchecked_index_of_clear_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
-    const void *d_scalar_blocks, bool is_scalar_obviously_bigger,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_scalar_blocks,
-    uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
-    void *const *ksks);
+    const uint64_t *h_clear_val, bool is_scalar_obviously_bigger,
+    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
+    int8_t *mem, void *const *bsks, void *const *ksks);

 void cleanup_cuda_unchecked_index_of_clear_64(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
--- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cu
@@ -7,8 +7,9 @@ uint64_t scratch_cuda_unchecked_match_value_64_async(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_output_packed_blocks, uint32_t max_output_is_zero,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -18,7 +19,7 @@ uint64_t scratch_cuda_unchecked_match_value_64_async(
  return scratch_cuda_unchecked_match_value<uint64_t>(
      CudaStreams(streams), (int_unchecked_match_buffer<uint64_t> **)mem_ptr,
      params, num_matches, num_input_blocks, num_output_packed_blocks,
-      max_output_is_zero, allocate_gpu_memory);
+      max_output_is_zero, match_parallelism, allocate_gpu_memory);
 }

 void cuda_unchecked_match_value_64_async(
@@ -62,9 +63,9 @@ uint64_t scratch_cuda_unchecked_match_value_or_64_async(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_matches, uint32_t num_input_blocks,
    uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
-    uint32_t max_output_is_zero, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t max_output_is_zero, uint32_t match_parallelism,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -75,7 +76,7 @@ uint64_t scratch_cuda_unchecked_match_value_or_64_async(
      CudaStreams(streams),
      (int_unchecked_match_value_or_buffer<uint64_t> **)mem_ptr, params,
      num_matches, num_input_blocks, num_match_packed_blocks, num_final_blocks,
-      max_output_is_zero, allocate_gpu_memory);
+      max_output_is_zero, match_parallelism, allocate_gpu_memory);
 }

 void cuda_unchecked_match_value_or_64_async(
@@ -111,9 +112,9 @@ uint64_t scratch_cuda_unchecked_contains_64_async(
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -122,7 +123,7 @@ uint64_t scratch_cuda_unchecked_contains_64_async(

  return scratch_cuda_unchecked_contains<uint64_t>(
      CudaStreams(streams), (int_unchecked_contains_buffer<uint64_t> **)mem_ptr,
-      params, num_inputs, num_blocks, allocate_gpu_memory);
+      params, num_inputs, num_blocks, match_parallelism, allocate_gpu_memory);
 }

 void cuda_unchecked_contains_64_async(CudaStreamsFFI streams,
@@ -161,9 +162,9 @@ uint64_t scratch_cuda_unchecked_contains_clear_64_async(
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t num_inputs, uint32_t num_blocks, uint32_t match_parallelism,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -173,7 +174,7 @@ uint64_t scratch_cuda_unchecked_contains_clear_64_async(
  return scratch_cuda_unchecked_contains_clear<uint64_t>(
      CudaStreams(streams),
      (int_unchecked_contains_clear_buffer<uint64_t> **)mem_ptr, params,
-      num_inputs, num_blocks, allocate_gpu_memory);
+      num_inputs, num_blocks, match_parallelism, allocate_gpu_memory);
 }

 void cuda_unchecked_contains_clear_64_async(
@@ -206,9 +207,9 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_clears, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t num_clears, uint32_t num_blocks, uint32_t match_parallelism,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -218,7 +219,7 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64_async(
  return scratch_cuda_unchecked_is_in_clears<uint64_t>(
      CudaStreams(streams),
      (int_unchecked_is_in_clears_buffer<uint64_t> **)mem_ptr, params,
-      num_clears, num_blocks, allocate_gpu_memory);
+      num_clears, num_blocks, match_parallelism, allocate_gpu_memory);
 }

 void cuda_unchecked_is_in_clears_64_async(
@@ -252,8 +253,9 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_clears, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -263,7 +265,8 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64_async(
  return scratch_cuda_unchecked_index_in_clears<uint64_t>(
      CudaStreams(streams),
      (int_unchecked_index_in_clears_buffer<uint64_t> **)mem_ptr, params,
-      num_clears, num_blocks, num_blocks_index, allocate_gpu_memory);
+      num_clears, num_blocks, num_blocks_index, match_parallelism,
+      allocate_gpu_memory);
 }

 void cuda_unchecked_index_in_clears_64_async(
@@ -304,8 +307,9 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -315,7 +319,8 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64_async(
  return scratch_cuda_unchecked_first_index_in_clears<uint64_t>(
      CudaStreams(streams),
      (int_unchecked_first_index_in_clears_buffer<uint64_t> **)mem_ptr, params,
-      num_unique, num_blocks, num_blocks_index, allocate_gpu_memory);
+      num_unique, num_blocks, num_blocks_index, match_parallelism,
+      allocate_gpu_memory);
 }

 void cuda_unchecked_first_index_in_clears_64_async(
@@ -356,8 +361,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -367,7 +373,8 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64_async(
  return scratch_cuda_unchecked_first_index_of_clear<uint64_t>(
      CudaStreams(streams),
      (int_unchecked_first_index_of_clear_buffer<uint64_t> **)mem_ptr, params,
-      num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
+      num_inputs, num_blocks, num_blocks_index, match_parallelism,
+      allocate_gpu_memory);
 }

 void cuda_unchecked_first_index_of_clear_64_async(
@@ -408,8 +415,9 @@ uint64_t scratch_cuda_unchecked_first_index_of_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -419,7 +427,8 @@ uint64_t scratch_cuda_unchecked_first_index_of_64_async(
  return scratch_cuda_unchecked_first_index_of<uint64_t>(
      CudaStreams(streams),
      (int_unchecked_first_index_of_buffer<uint64_t> **)mem_ptr, params,
-      num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
+      num_inputs, num_blocks, num_blocks_index, match_parallelism,
+      allocate_gpu_memory);
 }

 void cuda_unchecked_first_index_of_64_async(
@@ -460,8 +469,9 @@ uint64_t scratch_cuda_unchecked_index_of_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -470,7 +480,8 @@ uint64_t scratch_cuda_unchecked_index_of_64_async(

  return scratch_cuda_unchecked_index_of<uint64_t>(
      CudaStreams(streams), (int_unchecked_index_of_buffer<uint64_t> **)mem_ptr,
-      params, num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
+      params, num_inputs, num_blocks, num_blocks_index, match_parallelism,
+      allocate_gpu_memory);
 }

 void cuda_unchecked_index_of_64_async(CudaStreamsFFI streams,
@@ -513,8 +524,9 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t match_parallelism, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -524,16 +536,16 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64_async(
  return scratch_cuda_unchecked_index_of_clear<uint64_t>(
      CudaStreams(streams),
      (int_unchecked_index_of_clear_buffer<uint64_t> **)mem_ptr, params,
-      num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
+      num_inputs, num_blocks, num_blocks_index, match_parallelism,
+      allocate_gpu_memory);
 }

 void cuda_unchecked_index_of_clear_64_async(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *index_ct,
    CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
-    const void *d_scalar_blocks, bool is_scalar_obviously_bigger,
-    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_scalar_blocks,
-    uint32_t num_blocks_index, int8_t *mem, void *const *bsks,
-    void *const *ksks) {
+    const uint64_t *h_clear_val, bool is_scalar_obviously_bigger,
+    uint32_t num_inputs, uint32_t num_blocks, uint32_t num_blocks_index,
+    int8_t *mem, void *const *bsks, void *const *ksks) {
  PANIC_IF_FALSE(index_ct != inputs, "Output and input pointers must be "
                                     "different for out-of-place operations");
  PANIC_IF_FALSE(match_ct != inputs, "Output and input pointers must be "
@@ -543,9 +555,8 @@ void cuda_unchecked_index_of_clear_64_async(
                 "out-of-place operations");

  host_unchecked_index_of_clear<uint64_t>(
-      CudaStreams(streams), index_ct, match_ct, inputs,
-      (const uint64_t *)d_scalar_blocks, is_scalar_obviously_bigger, num_inputs,
-      num_blocks, num_scalar_blocks, num_blocks_index,
+      CudaStreams(streams), index_ct, match_ct, inputs, h_clear_val,
+      is_scalar_obviously_bigger, num_inputs, num_blocks, num_blocks_index,
      (int_unchecked_index_of_clear_buffer<uint64_t> *)mem, bsks,
      (uint64_t *const *)ksks);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -1711,6 +1711,7 @@ unsafe extern "C" {
        num_input_blocks: u32,
        num_output_packed_blocks: u32,
        max_output_is_zero: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -1794,6 +1795,7 @@ unsafe extern "C" {
        num_match_packed_blocks: u32,
        num_final_blocks: u32,
        max_output_is_zero: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -1835,6 +1837,7 @@ unsafe extern "C" {
        grouping_factor: u32,
        num_inputs: u32,
        num_blocks: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -1873,6 +1876,7 @@ unsafe extern "C" {
        grouping_factor: u32,
        num_inputs: u32,
        num_blocks: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -1914,6 +1918,7 @@ unsafe extern "C" {
        grouping_factor: u32,
        num_clears: u32,
        num_blocks: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -1956,6 +1961,7 @@ unsafe extern "C" {
        num_clears: u32,
        num_blocks: u32,
        num_blocks_index: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -2000,6 +2006,7 @@ unsafe extern "C" {
        num_unique: u32,
        num_blocks: u32,
        num_blocks_index: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -2054,6 +2061,7 @@ unsafe extern "C" {
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -2098,6 +2106,7 @@ unsafe extern "C" {
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -2142,6 +2151,7 @@ unsafe extern "C" {
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -2183,6 +2193,7 @@ unsafe extern "C" {
        num_inputs: u32,
        num_blocks: u32,
        num_blocks_index: u32,
+        match_parallelism: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -2196,11 +2207,10 @@ unsafe extern "C" {
        index_ct: *mut CudaRadixCiphertextFFI,
        match_ct: *mut CudaRadixCiphertextFFI,
        inputs: *const CudaRadixCiphertextFFI,
-        d_scalar_blocks: *const ffi::c_void,
+        h_clear_val: *const u64,
        is_scalar_obviously_bigger: bool,
        num_inputs: u32,
        num_blocks: u32,
-        num_scalar_blocks: u32,
        num_blocks_index: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
--- a/tfhe/src/high_level_api/integers/unsigned/base.rs
+++ b/tfhe/src/high_level_api/integers/unsigned/base.rs
@@ -1265,7 +1265,7 @@ where
                let size = cuda_key
                    .key
                    .key
-                    .get_unchecked_match_value_size_on_gpu(&ct_on_gpu, matches, streams);
+                    .get_unchecked_match_value_size_on_gpu(&ct_on_gpu, matches, 1, streams);
                Ok(size)
            }
            #[cfg(feature = "hpu")]
--- a/tfhe/src/integer/gpu/ffi.rs
+++ b/tfhe/src/integer/gpu/ffi.rs
@@ -8336,6 +8336,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -8480,6 +8481,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value<
        num_input_blocks,
        num_output_packed_blocks,
        max_output_is_zero as u32,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -8531,6 +8533,7 @@ pub(crate) fn cuda_backend_get_unchecked_match_value_size_on_gpu<Clear>(
    message_modulus: MessageModulus,
    carry_modulus: CarryModulus,
    pbs_type: PBSType,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) -> u64
 where
@@ -8579,6 +8582,7 @@ where
            num_input_blocks,
            num_output_packed_blocks,
            max_output_is_zero as u32,
+            match_parallelism,
            u32::try_from(message_modulus.0).unwrap(),
            u32::try_from(carry_modulus.0).unwrap(),
            pbs_type as u32,
@@ -8673,6 +8677,7 @@ where
            num_match_packed_blocks,
            num_output_blocks,
            max_output_is_zero as u32,
+            1,
            u32::try_from(message_modulus.0).unwrap(),
            u32::try_from(carry_modulus.0).unwrap(),
            pbs_type as u32,
@@ -8808,6 +8813,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -8935,6 +8941,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_match_value_or<
        num_match_packed_blocks,
        num_final_blocks,
        max_output_is_zero as u32,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -8988,6 +8995,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9058,6 +9066,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains<
        u32::try_from(grouping_factor.0).unwrap(),
        num_inputs,
        num_blocks,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -9112,6 +9121,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains_clear<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9188,6 +9198,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_contains_clear<
        u32::try_from(grouping_factor.0).unwrap(),
        num_inputs,
        num_blocks,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -9241,6 +9252,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_is_in_clears<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9292,6 +9304,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_is_in_clears<
        u32::try_from(grouping_factor.0).unwrap(),
        num_clears,
        num_blocks,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -9346,6 +9359,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_in_clears<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9414,6 +9428,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_in_clears<
        num_clears,
        num_blocks,
        num_blocks_index,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -9471,6 +9486,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_in_clears<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9564,6 +9580,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_in_clears<
        num_unique,
        num_blocks,
        num_blocks_index,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -9623,6 +9640,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of_clear<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9710,6 +9728,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of_clear<
        num_inputs,
        num_blocks,
        num_blocks_index,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -9767,6 +9786,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9854,6 +9874,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_first_index_of<
        num_inputs,
        num_blocks,
        num_blocks_index,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -9911,6 +9932,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -9998,6 +10020,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of<
        num_inputs,
        num_blocks,
        num_blocks_index,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -10056,6 +10079,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
    pbs_base_log: DecompositionBaseLog,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
+    match_parallelism: u32,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) {
    assert_eq!(streams.gpu_indexes[0], bootstrapping_key.gpu_index(0));
@@ -10076,9 +10100,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
        .is_some_and(|sub_slice| sub_slice.iter().any(|&scalar_block| scalar_block != 0));

    scalar_blocks.truncate(num_blocks_in_ct as usize);
-    let num_scalar_blocks = u32::try_from(scalar_blocks.len()).unwrap();
-
-    let d_scalar_blocks: CudaVec<u64> = CudaVec::from_cpu_async(&scalar_blocks, streams, 0);
+    scalar_blocks.resize(num_blocks_in_ct as usize, 0u64);

    let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration);

@@ -10152,6 +10174,7 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
        num_inputs,
        num_blocks_in_ct,
        num_blocks_index,
+        match_parallelism,
        u32::try_from(message_modulus.0).unwrap(),
        u32::try_from(carry_modulus.0).unwrap(),
        pbs_type as u32,
@@ -10164,11 +10187,10 @@ pub(crate) unsafe fn cuda_backend_unchecked_index_of_clear<
        &raw mut ffi_index,
        &raw mut ffi_match,
        ffi_inputs.as_ptr(),
-        d_scalar_blocks.as_c_ptr(0),
+        scalar_blocks.as_ptr(),
        is_scalar_obviously_bigger,
        num_inputs,
        num_blocks_in_ct,
-        num_scalar_blocks,
        num_blocks_index,
        mem_ptr,
        bootstrapping_key.ptr.as_ptr(),
--- a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs
@@ -27,16 +27,22 @@ impl CudaServerKey {
        streams: &CudaStreams,
    ) -> (CudaUnsignedRadixCiphertext, CudaBooleanBlock)
    where
-        Clear: UnsignedInteger + DecomposableInto<u64> + CastInto<usize>,
+        Clear:
+            UnsignedInteger + DecomposableInto<u64> + CastInto<usize> + CastInto<u64> + Sync + Send,
    {
-        if matches.get_values().is_empty() {
-            let trivial_ct: CudaUnsignedRadixCiphertext = self.create_trivial_radix(0, 1, streams);
-            let trivial_bool = CudaBooleanBlock::from_cuda_radix_ciphertext(
-                trivial_ct.duplicate(streams).into_inner(),
+        let num_matches = matches.get_values().len();
+
+        if num_matches == 0 {
+            let result_ct: CudaUnsignedRadixCiphertext = self.create_trivial_zero_radix(1, streams);
+            let result_bool: CudaBooleanBlock = CudaBooleanBlock(
+                self.create_trivial_zero_radix::<CudaUnsignedRadixCiphertext>(1, streams),
            );
-            return (trivial_ct, trivial_bool);
+            return (result_ct, result_bool);
        }

+        let match_parallelism = num_matches as u32;
+
+        let num_bits_in_message = self.message_modulus.0.ilog2();
        let max_output_value = matches
            .get_values()
            .iter()
@@ -44,13 +50,17 @@ impl CudaServerKey {
            .max_by(|(_, outputl), (_, outputr)| outputl.cmp(outputr))
            .expect("luts is not empty at this point")
            .1;
+        let max_val_u64: u64 = max_output_value.cast_into();
+        let num_output_unpacked_blocks = if max_val_u64 == 0 {
+            1
+        } else {
+            (max_val_u64.ilog2() + 1).div_ceil(num_bits_in_message)
+        };

-        let num_output_unpacked_blocks =
-            self.num_blocks_to_represent_unsigned_value(max_output_value);
+        let mut result: CudaUnsignedRadixCiphertext =
+            self.create_trivial_zero_radix(num_output_unpacked_blocks as usize, streams);

-        let mut result_ct: CudaUnsignedRadixCiphertext =
-            self.create_trivial_zero_radix(num_output_unpacked_blocks, streams);
-        let mut result_bool: CudaBooleanBlock = CudaBooleanBlock(
+        let mut boolean_result: CudaBooleanBlock = CudaBooleanBlock(
            self.create_trivial_zero_radix::<CudaUnsignedRadixCiphertext>(1, streams),
        );

@@ -63,8 +73,8 @@ impl CudaServerKey {
                CudaBootstrappingKey::Classic(d_bsk) => {
                    cuda_backend_unchecked_match_value(
                        streams,
-                        &mut result_ct,
-                        &mut result_bool,
+                        &mut result,
+                        &mut boolean_result,
                        ct.as_ref(),
                        matches,
                        self.message_modulus,
@@ -81,14 +91,15 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
                CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
                    cuda_backend_unchecked_match_value(
                        streams,
-                        &mut result_ct,
-                        &mut result_bool,
+                        &mut result,
+                        &mut boolean_result,
                        ct.as_ref(),
                        matches,
                        self.message_modulus,
@@ -105,19 +116,21 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
            }
        }

-        (result_ct, result_bool)
+        (result, boolean_result)
    }

    pub fn get_unchecked_match_value_size_on_gpu<Clear>(
        &self,
        ct: &CudaUnsignedRadixCiphertext,
        matches: &MatchValues<Clear>,
+        match_parallelism: u32,
        streams: &CudaStreams,
    ) -> u64
    where
@@ -150,6 +163,7 @@ impl CudaServerKey {
                    self.message_modulus,
                    self.carry_modulus,
                    PBSType::Classical,
+                    match_parallelism,
                    d_bsk.ms_noise_reduction_configuration.as_ref(),
                )
            }
@@ -170,6 +184,7 @@ impl CudaServerKey {
                    self.message_modulus,
                    self.carry_modulus,
                    PBSType::MultiBit,
+                    match_parallelism,
                    None,
                )
            }
@@ -294,6 +309,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = matches.get_values().len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -317,6 +334,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -341,6 +359,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
@@ -509,6 +528,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = cts.len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -531,6 +552,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -554,6 +576,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
@@ -660,6 +683,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = cts.len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -682,6 +707,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -705,6 +731,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
@@ -802,6 +829,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = clears.len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -824,6 +853,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -847,6 +877,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
@@ -957,6 +988,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = clears.len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -980,6 +1013,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -1004,6 +1038,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
@@ -1124,6 +1159,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = clears.len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1147,6 +1184,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -1171,6 +1209,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
@@ -1278,6 +1317,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = cts.len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1301,6 +1342,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -1325,6 +1367,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
@@ -1461,6 +1504,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = cts.len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1484,6 +1529,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -1508,6 +1554,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
@@ -1633,6 +1680,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = cts.len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1656,6 +1705,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -1680,6 +1730,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }
@@ -1803,6 +1854,8 @@ impl CudaServerKey {
            panic!("Only the standard atomic pattern is supported on GPU")
        };

+        let match_parallelism = cts.len() as u32;
+
        unsafe {
            match &self.bootstrapping_key {
                CudaBootstrappingKey::Classic(d_bsk) => {
@@ -1826,6 +1879,7 @@ impl CudaServerKey {
                        d_bsk.decomp_base_log,
                        PBSType::Classical,
                        LweBskGroupingFactor(0),
+                        match_parallelism,
                        d_bsk.ms_noise_reduction_configuration.as_ref(),
                    );
                }
@@ -1850,6 +1904,7 @@ impl CudaServerKey {
                        d_multibit_bsk.decomp_base_log,
                        PBSType::MultiBit,
                        d_multibit_bsk.grouping_factor,
+                        match_parallelism,
                        None,
                    );
                }