fix(gpu): disable cache

fix(gpu): remove broadcast
fix(gpu): test remove sync on lut create
2026-04-28 03:01:21 -04:00 · 2026-01-23 15:50:37 +01:00 · 2026-01-23 13:00:44 +01:00 · 2026-01-23 11:27:47 +01:00 · 2026-01-22 16:56:14 +01:00 · 2026-01-22 10:04:39 +01:00
16 changed files with 531 additions and 732 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -29,15 +29,13 @@ template <typename Torus> struct int_aes_lut_buffers {
        allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus, Torus)> and_lambda =
        [](Torus a, Torus b) -> Torus { return a & b; };
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
-        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, and_lambda, allocate_gpu_memory);
+
    auto active_streams_and_lut = streams.active_gpu_subset(
        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
        params.pbs_type);
-    this->and_lut->broadcast_lut(active_streams_and_lut);
+    this->and_lut->generate_and_broadcast_bivariate_lut(
+        active_streams_and_lut, {0}, {and_lambda}, allocate_gpu_memory);
+
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->flush_lut = new int_radix_lut<Torus>(
@@ -46,14 +44,11 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
      return x & 1;
    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
-        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, flush_lambda, allocate_gpu_memory);
+
    auto active_streams_flush_lut = streams.active_gpu_subset(
        AES_STATE_BITS * num_aes_inputs, params.pbs_type);
-    this->flush_lut->broadcast_lut(active_streams_flush_lut);
+    this->flush_lut->generate_and_broadcast_lut(
+        active_streams_flush_lut, {0}, {flush_lambda}, allocate_gpu_memory);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->carry_lut = new int_radix_lut<Torus>(
@@ -61,14 +56,11 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
      return (x >> 1) & 1;
    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
-        this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, carry_lambda, allocate_gpu_memory);
+
    auto active_streams_carry_lut =
        streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
-    this->carry_lut->broadcast_lut(active_streams_carry_lut);
+    this->carry_lut->generate_and_broadcast_lut(
+        active_streams_carry_lut, {0}, {carry_lambda}, allocate_gpu_memory);
    this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
@@ -65,14 +65,8 @@ template <typename Torus> struct boolean_bitop_buffer {
        return x % params.message_modulus;
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          message_extract_lut->get_lut(0, 0),
-          message_extract_lut->get_degree(0),
-          message_extract_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_message_extract, gpu_memory_allocated);
-      message_extract_lut->broadcast_lut(active_streams);
+      message_extract_lut->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
    }
    tmp_lwe_left = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -142,12 +136,8 @@ template <typename Torus> struct int_bitop_buffer {
          }
        };

-        generate_device_accumulator_bivariate<Torus>(
-            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
-        lut->broadcast_lut(active_streams);
+        lut->generate_and_broadcast_bivariate_lut(
+            active_streams, {0}, {lut_bivariate_f}, gpu_memory_allocated);
      }
      break;
    default:
@@ -156,6 +146,8 @@ template <typename Torus> struct int_bitop_buffer {
                                     num_radix_blocks, allocate_gpu_memory,
                                     size_tracker);

+      std::vector<std::function<Torus(Torus)>> lut_funcs;
+      std::vector<uint32_t> lut_indices;
      for (int i = 0; i < params.message_modulus; i++) {
        auto rhs = i;

@@ -171,14 +163,13 @@ template <typename Torus> struct int_bitop_buffer {
            return x ^ rhs;
          }
        };
-        generate_device_accumulator<Torus>(
-            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
-            lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_univariate_scalar_f,
-            gpu_memory_allocated);
-        lut->broadcast_lut(active_streams);
+
+        lut_funcs.push_back(lut_univariate_scalar_f);
+        lut_indices.push_back(i);
      }
+
+      lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
+                                      gpu_memory_allocated);
    }
  }

@@ -211,16 +202,11 @@ template <typename Torus> struct boolean_bitnot_buffer {
        return x % message_modulus;
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          message_extract_lut->get_lut(0, 0),
-          message_extract_lut->get_degree(0),
-          message_extract_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_message_extract, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
-      message_extract_lut->broadcast_lut(active_streams);
+
+      message_extract_lut->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
@@ -28,21 +28,17 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
      uint32_t bits_per_block = std::log2(params.message_modulus);
      uint32_t msg_modulus = params.message_modulus;

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          [msg_modulus, bits_per_block](Torus x) {
+      auto active_streams =
+          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+
+      lut->generate_and_broadcast_lut(
+          active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
            const auto xm = x % msg_modulus;
            const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
            return (Torus)((msg_modulus - 1) * sign_bit);
-          },
+          }},
          allocate_gpu_memory);

-      auto active_streams =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      lut->broadcast_lut(active_streams);
-
      this->last_block = new CudaRadixCiphertextFFI;

      create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
@@ -85,24 +85,6 @@ template <typename Torus> struct int_cmux_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
-        predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
-        predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
-        message_extract_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        message_extract_lut_f, gpu_memory_allocated);
    Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
    for (int index = 0; index < 2 * num_radix_blocks; index++) {
      if (index < num_radix_blocks) {
@@ -115,12 +97,18 @@ template <typename Torus> struct int_cmux_buffer {
        predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
        2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
+
    auto active_streams_pred =
        streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
-    predicate_lut->broadcast_lut(active_streams_pred);
+    predicate_lut->generate_and_broadcast_bivariate_lut(
+        active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
+        gpu_memory_allocated);
+
    auto active_streams_msg =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    message_extract_lut->broadcast_lut(active_streams_msg);
+
+    message_extract_lut->generate_and_broadcast_lut(
+        active_streams_msg, {0}, {message_extract_lut_f}, gpu_memory_allocated);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -39,22 +39,21 @@ template <typename Torus> struct int_are_all_block_true_buffer {
        max_chunks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

-    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
-                                            allocate_gpu_memory, size_tracker);
-    auto is_max_value_f = [max_value](Torus x) -> Torus {
-      return x == max_value;
-    };
    preallocated_h_lut = (Torus *)malloc(
        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
-        is_max_value->get_degree(0), is_max_value->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, is_max_value_f, gpu_memory_allocated);
+
+    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
+                                            allocate_gpu_memory, size_tracker);

    auto active_streams =
        streams.active_gpu_subset(max_chunks, params.pbs_type);
-    is_max_value->broadcast_lut(active_streams);
+
+    auto is_max_value_f = [max_value](Torus x) -> Torus {
+      return x == max_value;
+    };
+
+    is_max_value->generate_and_broadcast_lut(
+        active_streams, {0}, {is_max_value_f}, gpu_memory_allocated);
  }

  void release(CudaStreams streams) {
@@ -103,15 +102,10 @@ template <typename Torus> struct int_comparison_eq_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
-        is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
-
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    is_non_zero_lut->broadcast_lut(active_streams);
+    is_non_zero_lut->generate_and_broadcast_lut(
+        active_streams, {0}, {is_non_zero_lut_f}, gpu_memory_allocated);

    // Scalar may have up to num_radix_blocks blocks
    scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -129,32 +123,28 @@ template <typename Torus> struct int_comparison_eq_buffer {
        return (lhs == rhs);
      }
    };
+
+    std::vector<std::function<Torus(Torus)>> lut_funcs;
+    std::vector<uint32_t> lut_indices;
    for (int i = 0; i < total_modulus; i++) {
      auto lut_f = [i, operator_f](Torus x) -> Torus {
        return operator_f(i, x);
      };
-
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          scalar_comparison_luts->get_lut(0, i),
-          scalar_comparison_luts->get_degree(i),
-          scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f, gpu_memory_allocated);
+      lut_funcs.push_back(lut_f);
+      lut_indices.push_back(i);
    }
-    scalar_comparison_luts->broadcast_lut(active_streams);
+
+    scalar_comparison_luts->generate_and_broadcast_lut(
+        active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
+
    if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
      operator_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
-          operator_lut->get_degree(0), operator_lut->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, operator_f, gpu_memory_allocated);
-
-      operator_lut->broadcast_lut(active_streams);
+      operator_lut->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {operator_f}, gpu_memory_allocated);
+      // operator_lut->broadcast_lut(active_streams);
    } else {
      operator_lut = nullptr;
    }
@@ -221,9 +211,6 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
        streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    // LUTs
-    tree_inner_leaf_lut =
-        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
-                                 allocate_gpu_memory, size_tracker);

    tree_last_leaf_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -234,15 +221,14 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
    tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
-        tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        block_selector_f, gpu_memory_allocated);
+    tree_inner_leaf_lut =
+        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
+                                 allocate_gpu_memory, size_tracker);
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    tree_inner_leaf_lut->broadcast_lut(active_streams);
+    tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {block_selector_f}, allocate_gpu_memory);
  }

  void release(CudaStreams streams) {
@@ -426,12 +412,8 @@ template <typename Torus> struct int_comparison_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
-        identity_lut->get_degree(0), identity_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, identity_lut_f, gpu_memory_allocated);
-    identity_lut->broadcast_lut(active_streams);
+    identity_lut->generate_and_broadcast_lut(
+        active_streams, {0}, {identity_lut_f}, gpu_memory_allocated);

    uint32_t total_modulus = params.message_modulus * params.carry_modulus;
    auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -441,13 +423,8 @@ template <typename Torus> struct int_comparison_buffer {
    is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                           allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
-        is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, is_zero_f, gpu_memory_allocated);
-
-    is_zero_lut->broadcast_lut(active_streams);
+    is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
+                                            gpu_memory_allocated);

    switch (op) {
    case COMPARISON_TYPE::MAX:
@@ -522,13 +499,9 @@ template <typename Torus> struct int_comparison_buffer {
        PANIC("Cuda error: sign_lut creation failed due to wrong function.")
      };

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
-          signed_lut->get_degree(0), signed_lut->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, signed_lut_f, gpu_memory_allocated);
      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      signed_lut->broadcast_lut(active_streams);
+      signed_lut->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {signed_lut_f}, gpu_memory_allocated);
    }
    preallocated_h_lut = (Torus *)malloc(
        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -283,12 +283,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                                     zero_out_if_not_1_lut_2};
    size_t lut_gpu_indexes[2] = {0, 3};
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(lut_gpu_indexes[j]),
-          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
+      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
+                                          {0}, {zero_out_if_not_1_lut_f},
+                                          gpu_memory_allocated);
    }

    luts[0] = zero_out_if_not_2_lut_1;
@@ -296,12 +293,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    lut_gpu_indexes[0] = 1;
    lut_gpu_indexes[1] = 2;
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(lut_gpu_indexes[j]),
-          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
+      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
+                                          {0}, {zero_out_if_not_2_lut_f},
+                                          gpu_memory_allocated);
    }

    quotient_lut_1 =
@@ -321,21 +315,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    };
    auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };

-    generate_device_accumulator<Torus>(
-        streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
-        quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
-    generate_device_accumulator<Torus>(
-        streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
-        quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
-        quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
+    quotient_lut_1->generate_and_broadcast_lut(
+        streams.get_ith(2), {0}, {quotient_lut_1_f}, gpu_memory_allocated);
+    quotient_lut_2->generate_and_broadcast_lut(
+        streams.get_ith(1), {0}, {quotient_lut_2_f}, gpu_memory_allocated);
+    quotient_lut_3->generate_and_broadcast_lut(
+        streams.get_ith(0), {0}, {quotient_lut_3_f}, gpu_memory_allocated);

    message_extract_lut_1 = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -350,15 +335,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    luts[0] = message_extract_lut_1;
    luts[1] = message_extract_lut_2;

+    auto active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
+
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
-      auto active_streams =
-          streams.active_gpu_subset(num_blocks, params.pbs_type);
-      luts[j]->broadcast_lut(active_streams);
+      luts[j]->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
    }
  }

@@ -1007,24 +989,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      masking_luts_2[i] = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
-          masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_masking, gpu_memory_allocated);
      auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
-      masking_luts_1[i]->broadcast_lut(active_streams_1);
+      masking_luts_1[i]->generate_and_broadcast_lut(
+          active_streams_1, {0}, {lut_f_masking}, gpu_memory_allocated);

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
-          masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_masking, gpu_memory_allocated);
      auto active_streams_2 =
          streams.active_gpu_subset(num_blocks, params.pbs_type);
-      masking_luts_2[i]->broadcast_lut(active_streams_2);
+      masking_luts_2[i]->generate_and_broadcast_lut(
+          active_streams_2, {0}, {lut_f_masking}, gpu_memory_allocated);
    }

    // create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1042,15 +1014,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

    int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
                                     message_extract_lut_2};
+
    auto active_streams =
        streams.active_gpu_subset(num_blocks, params.pbs_type);
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
-      luts[j]->broadcast_lut(active_streams);
+      luts[j]->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
    }

    // Give name to closures to improve readability
@@ -1141,14 +1110,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          merge_overflow_flags_luts[i]->get_lut(0, 0),
-          merge_overflow_flags_luts[i]->get_degree(0),
-          merge_overflow_flags_luts[i]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f_bit, gpu_memory_allocated);
-      merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
+      merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
+          active_gpu_count_for_bits, {0}, {lut_f_bit}, gpu_memory_allocated);
    }
  }

@@ -1557,16 +1520,12 @@ template <typename Torus> struct int_div_rem_memory {
      compare_signed_bits_lut = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          compare_signed_bits_lut->get_lut(0, 0),
-          compare_signed_bits_lut->get_degree(0),
-          compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          f_compare_extracted_signed_bits, gpu_memory_allocated);
      auto active_gpu_count_cmp =
          streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
-      compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
+
+      compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
+          active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
+          gpu_memory_allocated);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
@@ -53,13 +53,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return count;
    };

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
-        univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
-
-    univ_lut_mem->broadcast_lut(active_streams);
+    univ_lut_mem->generate_and_broadcast_lut(
+        active_streams, {0}, {generate_uni_lut_lambda}, allocate_gpu_memory);

    auto generate_bi_lut_lambda =
        [num_bits](Torus block_num_bit_count,
@@ -70,13 +65,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return 0;
    };

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
-        biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
-
-    biv_lut_mem->broadcast_lut(active_streams);
+    biv_lut_mem->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {generate_bi_lut_lambda}, allocate_gpu_memory);

    this->tmp_ct = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -232,7 +222,7 @@ template <typename Torus> struct int_ilog2_buffer {
        this->sum_output_not_propagated, counter_num_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

-    this->lut_message_not =
+    lut_message_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
                                 allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus)> lut_message_lambda =
@@ -240,16 +230,11 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t message = x % this->params.message_modulus;
      return (~message) % this->params.message_modulus;
    };
-    generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
-                                this->lut_message_not->get_lut(0, 0),
-                                this->lut_message_not->get_degree(0),
-                                this->lut_message_not->get_max_degree(0),
-                                params.glwe_dimension, params.polynomial_size,
-                                params.message_modulus, params.carry_modulus,
-                                lut_message_lambda, allocate_gpu_memory);
+
    auto active_streams =
        streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
-    lut_message_not->broadcast_lut(active_streams);
+    lut_message_not->generate_and_broadcast_lut(
+        active_streams, {0}, {lut_message_lambda}, allocate_gpu_memory);

    this->lut_carry_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -259,13 +244,8 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t carry = x / this->params.message_modulus;
      return (~carry) % this->params.message_modulus;
    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0),
-        this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
-        this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        lut_carry_lambda, allocate_gpu_memory);
-    lut_carry_not->broadcast_lut(active_streams);
+    lut_carry_not->generate_and_broadcast_lut(
+        active_streams, {0}, {lut_carry_lambda}, allocate_gpu_memory);

    this->message_blocks_not = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -9,6 +9,7 @@
 #include "utils/helper_multi_gpu.cuh"
 #include <cmath>
 #include <functional>
+#include <map>
 #include <queue>

 #include <stdio.h>
@@ -835,6 +836,56 @@ struct int_radix_lut_custom_input_output {
    }
  }

+  // Generates the univariate LUT f[i] into slot lut_indexes[i], for every i.
+  // Each accumulator is built by generate_device_accumulator on
+  // streams.stream(0) / streams.gpu_index(0), targeting get_lut(0, slot)
+  // together with that slot's get_degree / get_max_degree entries.
+  // `streams` should be a subset of the active streams;
+  // `gpu_memory_allocated` is forwarded unchanged to the generator.
+  // NOTE: despite the name, the trailing broadcast_lut(streams) call is
+  // currently commented out, so nothing is copied to other GPUs here.
+  void generate_and_broadcast_lut(
+      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
+      std::vector<std::function<OutputTorus(OutputTorus)>> f,
+      bool gpu_memory_allocated) {
+    // f[i] is read for every index, so both vectors must have equal length.
+    PANIC_IF_FALSE(lut_indexes.size() == f.size(),
+                   "LUT indexes and LUT functions must have the same size");
+    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
+      generate_device_accumulator<OutputTorus>(
+          streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
+          get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, f[i], gpu_memory_allocated);
+    }
+    // broadcast_lut(streams);
+  }
+
+  // Bivariate counterpart of generate_and_broadcast_lut: generates f[i] with
+  // generate_device_accumulator_bivariate into slot lut_indexes[i], using
+  // streams.stream(0) / streams.gpu_index(0) and get_lut(0, slot).
+  // `streams` should be a subset of the active streams;
+  // `gpu_memory_allocated` is forwarded unchanged to the generator.
+  // The generator is instantiated on OutputTorus so that it agrees with the
+  // type of `f` and with the univariate variant above.
+  // NOTE: the trailing broadcast_lut(streams) call is currently commented out.
+  void generate_and_broadcast_bivariate_lut(
+      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
+      std::vector<std::function<OutputTorus(OutputTorus, OutputTorus)>> f,
+      bool gpu_memory_allocated) {
+    // f[i] is read for every index, so both vectors must have equal length.
+    PANIC_IF_FALSE(lut_indexes.size() == f.size(),
+                   "LUT indexes and LUT functions must have the same size");
+    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
+      generate_device_accumulator_bivariate<OutputTorus>(
+          streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
+          get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, f[i], gpu_memory_allocated);
+    }
+    // broadcast_lut(streams);
+  }
+
  void release(CudaStreams streams) {
    PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(),
                   "Lut vec and Lut vec indexes must have the same size");
@@ -985,18 +1036,15 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
                                   bits_per_block * num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

+    std::vector<std::function<Torus(Torus)>> lut_funs;
+    std::vector<uint32_t> lut_indices;
    for (int i = 0; i < bits_per_block; i++) {
-
      auto operator_f = [i, final_offset](Torus x) -> Torus {
        Torus y = (x >> i) & 1;
        return y << final_offset;
      };
-
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
-          lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          operator_f, gpu_memory_allocated);
+      lut_funs.push_back(operator_f);
+      lut_indices.push_back(i);
    }

    /**
@@ -1015,7 +1063,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {

    auto active_streams = streams.active_gpu_subset(
        bits_per_block * num_radix_blocks, params.pbs_type);
-    lut->broadcast_lut(active_streams);
+
+    lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
+                                    gpu_memory_allocated);
+    // lut->broadcast_lut(active_streams);

    /**
     * the input indexes should take the first bits_per_block PBS to target
@@ -1091,24 +1142,6 @@ template <typename Torus> struct int_fullprop_buffer {
    };

    //
-    Torus *lut_buffer_message = lut->get_lut(0, 0);
-    uint64_t *message_degree = lut->get_degree(0);
-    uint64_t *message_max_degree = lut->get_max_degree(0);
-    Torus *lut_buffer_carry = lut->get_lut(0, 1);
-    uint64_t *carry_degree = lut->get_degree(1);
-    uint64_t *carry_max_degree = lut->get_max_degree(1);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut_buffer_message,
-        message_degree, message_max_degree, params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        lut_f_message, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut_buffer_carry, carry_degree,
-        carry_max_degree, params.glwe_dimension, params.polynomial_size,
-        params.message_modulus, params.carry_modulus, lut_f_carry,
-        gpu_memory_allocated);

    uint64_t lwe_indexes_size = 2 * sizeof(Torus);
    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
@@ -1118,9 +1151,15 @@ template <typename Torus> struct int_fullprop_buffer {
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
+
    //
    // No broadcast is needed because full prop is done on 1 single GPU.
+    // By passing a single-GPU CudaStreams with streams.get_ith(0) the LUT is
+    // not broadcast.
    //
+    lut->generate_and_broadcast_lut(streams.get_ith(0), {0, 1},
+                                    {lut_f_message, lut_f_carry},
+                                    gpu_memory_allocated);

    tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -1238,9 +1277,10 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
      if (total_ciphertexts > 0 ||
          reduce_degrees_for_single_carry_propagation) {
        uint64_t size_tracker = 0;
+        allocated_luts_message_carry = true;
        luts_message_carry = new int_radix_lut<Torus>(
            streams, params, 2, pbs_count, true, size_tracker);
-        allocated_luts_message_carry = true;
+
        uint64_t message_modulus_bits =
            (uint64_t)std::log2(params.message_modulus);
        uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
@@ -1256,7 +1296,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
            streams, upper_bound_num_blocks, size_tracker, true);
      }
    }
+
    if (allocated_luts_message_carry) {
+
      auto message_acc = luts_message_carry->get_lut(0, 0);
      auto carry_acc = luts_message_carry->get_lut(0, 1);

@@ -1268,22 +1310,11 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
        return x / message_modulus;
      };

-      // generate accumulators
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), message_acc,
-          luts_message_carry->get_degree(0),
-          luts_message_carry->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, message_modulus, params.carry_modulus,
-          lut_f_message, gpu_memory_allocated);
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), carry_acc,
-          luts_message_carry->get_degree(1),
-          luts_message_carry->get_max_degree(1), params.glwe_dimension,
-          params.polynomial_size, message_modulus, params.carry_modulus,
-          lut_f_carry, gpu_memory_allocated);
      auto active_gpu_count_mc =
          streams.active_gpu_subset(pbs_count, params.pbs_type);
-      luts_message_carry->broadcast_lut(active_gpu_count_mc);
+      luts_message_carry->generate_and_broadcast_lut(
+          active_gpu_count_mc, {0, 1}, {lut_f_message, lut_f_carry},
+          gpu_memory_allocated);
    }
  }
  int_sum_ciphertexts_vec_memory(
@@ -1418,10 +1449,6 @@ template <typename Torus> struct int_seq_group_prop_memory {
                            uint32_t group_size, uint32_t big_lwe_size_bytes,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
-    auto glwe_dimension = params.glwe_dimension;
-    auto polynomial_size = params.polynomial_size;
-    auto message_modulus = params.message_modulus;
-    auto carry_modulus = params.carry_modulus;

    grouping_size = group_size;
    group_resolved_carries = new CudaRadixCiphertextFFI;
@@ -1431,22 +1458,20 @@ template <typename Torus> struct int_seq_group_prop_memory {
        allocate_gpu_memory);

    int num_seq_luts = grouping_size - 1;
-    Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
    lut_sequential_algorithm =
        new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
                                 allocate_gpu_memory, size_tracker);
+    std::vector<std::function<Torus(Torus)>> lut_funcs;
+    std::vector<uint32_t> lut_indices;
+    Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
+
    for (int index = 0; index < num_seq_luts; index++) {
      auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
        return (propa_cum_sum_block >> (index + 1)) & 1;
      };
-      auto seq_lut = lut_sequential_algorithm->get_lut(0, index);
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), seq_lut,
-          lut_sequential_algorithm->get_degree(index),
-          lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
-          polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
-          gpu_memory_allocated);
+      lut_funcs.push_back(f_lut_sequential);
      h_seq_lut_indexes[index] = index;
+      lut_indices.push_back(index);
    }
    Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1454,9 +1479,12 @@ template <typename Torus> struct int_seq_group_prop_memory {
        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(num_seq_luts, params.pbs_type);
-    lut_sequential_algorithm->broadcast_lut(active_streams);
+    lut_sequential_algorithm->generate_and_broadcast_lut(
+        active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
+    // lut_sequential_algorithm->broadcast_lut(active_streams);
    free(h_seq_lut_indexes);
-  };
+  }
+
  void release(CudaStreams streams) {
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   group_resolved_carries,
@@ -1478,10 +1506,6 @@ template <typename Torus> struct int_hs_group_prop_memory {
                           uint32_t num_groups, uint32_t big_lwe_size_bytes,
                           bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
-    auto glwe_dimension = params.glwe_dimension;
-    auto polynomial_size = params.polynomial_size;
-    auto message_modulus = params.message_modulus;
-    auto carry_modulus = params.carry_modulus;

    auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus {
      if (msb == 2) {
@@ -1501,16 +1525,11 @@ template <typename Torus> struct int_hs_group_prop_memory {
    lut_hillis_steele = new int_radix_lut<Torus>(
        streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
-        lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, f_lut_hillis_steele,
-        gpu_memory_allocated);
    auto active_streams =
        streams.active_gpu_subset(num_groups, params.pbs_type);
-    lut_hillis_steele->broadcast_lut(active_streams);
-  };
+    lut_hillis_steele->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {f_lut_hillis_steele}, gpu_memory_allocated);
+  }
  void release(CudaStreams streams) {

    lut_hillis_steele->release(streams);
@@ -1800,112 +1819,6 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
      num_extra_luts = 1;
    }

-    uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
-    luts_array_second_step = new int_radix_lut<Torus>(
-        streams, params, num_luts_second_step, num_radix_blocks,
-        allocate_gpu_memory, size_tracker);
-
-    // luts for first group inner propagation
-    for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
-      auto f_first_grouping_inner_propagation =
-          [lut_id](Torus propa_cum_sum_block) -> Torus {
-        uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
-
-        if (carry != 0) {
-          return 2ull; // Generates Carry
-        } else {
-          return 0ull; // Does not generate carry
-        }
-      };
-
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          luts_array_second_step->get_lut(0, lut_id),
-          luts_array_second_step->get_degree(lut_id),
-          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
-          polynomial_size, message_modulus, carry_modulus,
-          f_first_grouping_inner_propagation, gpu_memory_allocated);
-    }
-
-    auto f_first_grouping_outer_propagation =
-        [num_bits_in_block](Torus block) -> Torus {
-      return (block >> (num_bits_in_block - 1)) & 1;
-    };
-
-    int lut_id = grouping_size - 1;
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        luts_array_second_step->get_lut(0, lut_id),
-        luts_array_second_step->get_degree(lut_id),
-        luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
-        f_first_grouping_outer_propagation, gpu_memory_allocated);
-
-    // for other groupings inner propagation
-    for (int index = 0; index < grouping_size; index++) {
-      uint32_t lut_id = index + grouping_size;
-
-      auto f_other_groupings_inner_propagation =
-          [index](Torus propa_cum_sum_block) -> Torus {
-        uint64_t mask = (2 << index) - 1;
-        if (propa_cum_sum_block >= (2 << index)) {
-          return 2ull; // Generates
-        } else if ((propa_cum_sum_block & mask) == mask) {
-          return 1ull; // Propagate
-        } else {
-          return 0ull; // Nothing
-        }
-      };
-
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          luts_array_second_step->get_lut(0, lut_id),
-          luts_array_second_step->get_degree(lut_id),
-          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
-          polynomial_size, message_modulus, carry_modulus,
-          f_other_groupings_inner_propagation, gpu_memory_allocated);
-    }
-
-    if (use_sequential_algorithm_to_resolve_group_carries) {
-      for (int index = 0; index < grouping_size - 1; index++) {
-        uint32_t lut_id = index + 2 * grouping_size;
-
-        auto f_group_propagation = [index, block_modulus,
-                                    num_bits_in_block](Torus block) -> Torus {
-          if (block == (block_modulus - 1)) {
-            return 0ull;
-          } else {
-            return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
-          }
-        };
-
-        generate_device_accumulator<Torus>(
-            streams.stream(0), streams.gpu_index(0),
-            luts_array_second_step->get_lut(0, lut_id),
-            luts_array_second_step->get_degree(lut_id),
-            luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
-            polynomial_size, message_modulus, carry_modulus,
-            f_group_propagation, gpu_memory_allocated);
-      }
-    } else {
-      uint32_t lut_id = 2 * grouping_size;
-      auto f_group_propagation = [block_modulus](Torus block) {
-        if (block == (block_modulus - 1)) {
-          return 2ull;
-        } else {
-          return UINT64_MAX % (block_modulus * 2ull);
-        }
-      };
-
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          luts_array_second_step->get_lut(0, lut_id),
-          luts_array_second_step->get_degree(lut_id),
-          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
-          polynomial_size, message_modulus, carry_modulus, f_group_propagation,
-          gpu_memory_allocated);
-    }
-
    Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);

    for (int index = 0; index < num_radix_blocks; index++) {
@@ -1941,6 +1854,11 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
      }
    }

+    uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
+    luts_array_second_step = new int_radix_lut<Torus>(
+        streams, params, num_luts_second_step, num_radix_blocks,
+        allocate_gpu_memory, size_tracker);
+
    // copy the indexes to the gpu
    Torus *second_lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1951,9 +1869,92 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
        scalar_array_cum_sum, h_scalar_array_cum_sum,
        num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
+
+    std::vector<std::function<Torus(Torus)>> lut_funcs;
+    std::vector<uint32_t> lut_ids;
+
+    // luts for first group inner propagation
+    for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
+      auto f_first_grouping_inner_propagation =
+          [lut_id](Torus propa_cum_sum_block) -> Torus {
+        uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
+
+        if (carry != 0) {
+          return 2ull; // Generates Carry
+        } else {
+          return 0ull; // Does not generate carry
+        }
+      };
+      lut_funcs.push_back(f_first_grouping_inner_propagation);
+      lut_ids.push_back(lut_id);
+    }
+
+    auto f_first_grouping_outer_propagation =
+        [num_bits_in_block](Torus block) -> Torus {
+      return (block >> (num_bits_in_block - 1)) & 1;
+    };
+
+    int lut_id = grouping_size - 1;
+
+    lut_funcs.push_back(f_first_grouping_outer_propagation);
+    lut_ids.push_back(lut_id);
+
+    // for other groupings inner propagation
+    for (int index = 0; index < grouping_size; index++) {
+      uint32_t lut_id = index + grouping_size;
+
+      auto f_other_groupings_inner_propagation =
+          [index](Torus propa_cum_sum_block) -> Torus {
+        uint64_t mask = (2 << index) - 1;
+        if (propa_cum_sum_block >= (2 << index)) {
+          return 2ull; // Generates
+        } else if ((propa_cum_sum_block & mask) == mask) {
+          return 1ull; // Propagate
+        } else {
+          return 0ull; // Nothing
+        }
+      };
+
+      lut_funcs.push_back(f_other_groupings_inner_propagation);
+      lut_ids.push_back(lut_id);
+    }
+
+    if (use_sequential_algorithm_to_resolve_group_carries) {
+      for (int index = 0; index < grouping_size - 1; index++) {
+        uint32_t lut_id = index + 2 * grouping_size;
+
+        auto f_group_propagation = [index, block_modulus,
+                                    num_bits_in_block](Torus block) -> Torus {
+          if (block == (block_modulus - 1)) {
+            return 0ull;
+          } else {
+            return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
+          }
+        };
+
+        lut_funcs.push_back(f_group_propagation);
+        lut_ids.push_back(lut_id);
+      }
+    } else {
+      uint32_t lut_id = 2 * grouping_size;
+      auto f_group_propagation = [block_modulus](Torus block) {
+        if (block == (block_modulus - 1)) {
+          return 2ull;
+        } else {
+          return UINT64_MAX % (block_modulus * 2ull);
+        }
+      };
+
+      lut_funcs.push_back(f_group_propagation);
+      lut_ids.push_back(lut_id);
+    }
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_array_second_step->broadcast_lut(active_streams);
+    luts_array_second_step->generate_and_broadcast_lut(
+        active_streams, lut_ids, lut_funcs, gpu_memory_allocated);
+
+    // luts_array_second_step->broadcast_lut(active_streams);

    if (use_sequential_algorithm_to_resolve_group_carries) {

@@ -2041,12 +2042,28 @@ template <typename Torus> struct int_sc_prop_memory {
  uint32_t requested_flag;
  bool gpu_memory_allocated;

+  // Writes the per-block LUT selection for lut_message_extract into its host
+  // buffer: entries [0, num_radix_blocks) pick LUT 0 and the one extra entry
+  // at position num_radix_blocks picks LUT 1. The num_radix_blocks + 1 values
+  // are then copied to get_lut_indexes(0, 0) via stream 0 / GPU index 0.
+  void setup_message_extract_indices_for_carry_async(CudaStreams streams,
+                                                     uint32_t num_radix_blocks,
+                                                     bool allocate_gpu_memory) {
+    Torus *host_indexes = lut_message_extract->h_lut_indexes;
+    for (uint32_t block = 0; block < num_radix_blocks; block++)
+      host_indexes[block] = 0;
+    host_indexes[num_radix_blocks] = 1;
+    cuda_memcpy_with_size_tracking_async_to_gpu(
+        lut_message_extract->get_lut_indexes(0, 0), host_indexes,
+        (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
+        streams.gpu_index(0), allocate_gpu_memory);
+  }
+
  int_sc_prop_memory(CudaStreams streams, int_radix_params params,
                     uint32_t num_radix_blocks, uint32_t requested_flag_in,
                     bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->params = params;
-    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;
@@ -2069,24 +2086,6 @@ template <typename Torus> struct int_sc_prop_memory {
        streams, params, num_radix_blocks, grouping_size, num_groups,
        allocate_gpu_memory, size_tracker);

-    //  Step 3 elements
-    int num_luts_message_extract =
-        requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
-    lut_message_extract = new int_radix_lut<Torus>(
-        streams, params, num_luts_message_extract, num_radix_blocks + 1,
-        allocate_gpu_memory, size_tracker);
-    // lut for the first block in the first grouping
-    auto f_message_extract = [message_modulus](Torus block) -> Torus {
-      return (block >> 1) % message_modulus;
-    };
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
-        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, f_message_extract,
-        gpu_memory_allocated);
-
    // This store a single block that with be used to store the overflow or
    // carry results
    output_flag = new CudaRadixCiphertextFFI;
@@ -2137,22 +2136,30 @@ template <typename Torus> struct int_sc_prop_memory {
        return output1 << 3 | output2 << 2;
      };

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          lut_overflow_flag_prep->get_lut(0, 0),
-          lut_overflow_flag_prep->get_degree(0),
-          lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
-          polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
-          gpu_memory_allocated);
-
      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      lut_overflow_flag_prep->broadcast_lut(active_streams);
+      lut_overflow_flag_prep->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {f_overflow_fp}, gpu_memory_allocated);
    }

+    //  Step 3 elements
+    int num_luts_message_extract =
+        requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
+    lut_message_extract = new int_radix_lut<Torus>(
+        streams, params, num_luts_message_extract, num_radix_blocks + 1,
+        allocate_gpu_memory, size_tracker);
+    // lut for the first block in the first grouping
+    auto f_message_extract = [message_modulus](Torus block) -> Torus {
+      return (block >> 1) % message_modulus;
+    };
+
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
+
    // For the final cleanup in case of overflow or carry (it seems that I can)
    // It seems that this lut could be apply together with the other one but for
    // now we won't do it
-    if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
+    switch (requested_flag) {
+    case outputFlag::FLAG_OVERFLOW: { // Overflow case
      auto f_overflow_last = [num_radix_blocks,
                              requested_flag_in](Torus block) -> Torus {
        uint32_t position = (num_radix_blocks == 1 &&
@@ -2164,62 +2171,38 @@ template <typename Torus> struct int_sc_prop_memory {
        Torus does_overflow_if_carry_is_0 = (block >> 2) & 1;
        if (input_carry == outputFlag::FLAG_OVERFLOW) {
          return does_overflow_if_carry_is_1;
-        } else {
-          return does_overflow_if_carry_is_0;
        }
+        return does_overflow_if_carry_is_0;
      };
+      setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
+                                                    allocate_gpu_memory);

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          lut_message_extract->get_lut(0, 1),
-          lut_message_extract->get_degree(1),
-          lut_message_extract->get_max_degree(1), glwe_dimension,
-          polynomial_size, message_modulus, carry_modulus, f_overflow_last,
+      lut_message_extract->generate_and_broadcast_lut(
+          active_streams, {0, 1}, {f_message_extract, f_overflow_last},
          gpu_memory_allocated);
-
-      Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
-      for (int index = 0; index < num_radix_blocks + 1; index++) {
-        if (index < num_radix_blocks) {
-          h_lut_indexes[index] = 0;
-        } else {
-          h_lut_indexes[index] = 1;
-        }
-      }
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
-          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
-          streams.gpu_index(0), allocate_gpu_memory);
+      break;
    }
-    if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
+    case outputFlag::FLAG_CARRY: { // Carry case
+
+      setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
+                                                    allocate_gpu_memory);

      auto f_carry_last = [](Torus block) -> Torus {
        return ((block >> 2) & 1);
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          lut_message_extract->get_lut(0, 1),
-          lut_message_extract->get_degree(1),
-          lut_message_extract->get_max_degree(1), glwe_dimension,
-          polynomial_size, message_modulus, carry_modulus, f_carry_last,
+      lut_message_extract->generate_and_broadcast_lut(
+          active_streams, {0, 1}, {f_message_extract, f_carry_last},
          gpu_memory_allocated);
-
-      Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
-      for (int index = 0; index < num_radix_blocks + 1; index++) {
-        if (index < num_radix_blocks) {
-          h_lut_indexes[index] = 0;
-        } else {
-          h_lut_indexes[index] = 1;
-        }
-      }
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
-          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
-          streams.gpu_index(0), allocate_gpu_memory);
+      break;
    }
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
-    lut_message_extract->broadcast_lut(active_streams);
+    default:
+      lut_message_extract->generate_and_broadcast_lut(
+          active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
+      break;
+    }
+
+    // lut_message_extract->broadcast_lut(active_streams);
  };

  void release(CudaStreams streams) {
@@ -2517,16 +2500,11 @@ template <typename Torus> struct int_borrow_prop_memory {
      return (block >> 1) % message_modulus;
    };

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
-        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, f_message_extract,
-        gpu_memory_allocated);
    active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);

-    lut_message_extract->broadcast_lut(active_streams);
+    lut_message_extract->generate_and_broadcast_lut(
+        active_streams, {0}, {f_message_extract}, gpu_memory_allocated);

    if (compute_overflow) {
      lut_borrow_flag =
@@ -2537,12 +2515,8 @@ template <typename Torus> struct int_borrow_prop_memory {
        return ((block >> 2) & 1);
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          lut_borrow_flag->get_lut(0, 0), lut_borrow_flag->get_degree(0),
-          lut_borrow_flag->get_max_degree(0), glwe_dimension, polynomial_size,
-          message_modulus, carry_modulus, f_borrow_flag, gpu_memory_allocated);
-      lut_borrow_flag->broadcast_lut(active_streams);
+      lut_borrow_flag->generate_and_broadcast_lut(
+          active_streams, {0}, {f_borrow_flag}, gpu_memory_allocated);
    }

    active_streams =
--- a/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
@@ -37,17 +37,14 @@ template <typename Torus> struct int_mul_memory {
      zero_out_predicate_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);
-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          zero_out_predicate_lut->get_lut(0, 0),
-          zero_out_predicate_lut->get_degree(0),
-          zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          zero_out_predicate_lut_f, gpu_memory_allocated);

      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      zero_out_predicate_lut->broadcast_lut(active_streams);
+      zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {zero_out_predicate_lut_f},
+          gpu_memory_allocated);
+
+      // zero_out_predicate_lut->broadcast_lut(active_streams);

      zero_out_mem = new int_zero_out_if_buffer<Torus>(
          streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -55,10 +52,7 @@ template <typename Torus> struct int_mul_memory {
      return;
    }

-    auto glwe_dimension = params.glwe_dimension;
-    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
-    auto carry_modulus = params.carry_modulus;

    // 'vector_result_lsb' contains blocks from all possible shifts of
    // radix_lwe_left excluding zero ciphertext blocks
@@ -102,18 +96,6 @@ template <typename Torus> struct int_mul_memory {
      return (x * y) / message_modulus;
    };

-    // generate accumulators
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), lsb_acc,
-        luts_array->get_degree(0), luts_array->get_max_degree(0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        lut_f_lsb, gpu_memory_allocated);
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), msb_acc,
-        luts_array->get_degree(1), luts_array->get_max_degree(1),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        lut_f_msb, gpu_memory_allocated);
-
    // lut_indexes_vec for luts_array should be reinitialized
    // first lsb_vector_block_count value should reference to lsb_acc
    // last msb_vector_block_count values should reference to msb_acc
@@ -123,9 +105,12 @@ template <typename Torus> struct int_mul_memory {
          streams.stream(0), streams.gpu_index(0),
          luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
          msb_vector_block_count);
+
    auto active_streams =
        streams.active_gpu_subset(total_block_count, params.pbs_type);
-    luts_array->broadcast_lut(active_streams);
+    luts_array->generate_and_broadcast_bivariate_lut(
+        active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, gpu_memory_allocated);
+
    // create memory object for sum ciphertexts
    sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
        streams, params, num_radix_blocks, 2 * num_radix_blocks,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
@@ -85,15 +85,11 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
-          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          shift_lut_f, gpu_memory_allocated);
+
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->broadcast_lut(active_streams);
+      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);

      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
@@ -172,16 +168,10 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
-          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          shift_lut_f, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->broadcast_lut(active_streams);
-
+      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
  }
@@ -271,16 +261,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return shifted | padding;
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          shift_last_block_lut_univariate->get_lut(0, 0),
-          shift_last_block_lut_univariate->get_degree(0),
-          shift_last_block_lut_univariate->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
      auto active_streams_shift_last =
          streams.active_gpu_subset(1, params.pbs_type);
-      shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
+      shift_last_block_lut_univariate->generate_and_broadcast_lut(
+          active_streams_shift_last, {0}, {last_block_lut_f},
+          gpu_memory_allocated);

      lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
    }
@@ -298,15 +283,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
      return (params.message_modulus - 1) * x_sign_bit;
    };

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        padding_block_lut_univariate->get_lut(0, 0),
-        padding_block_lut_univariate->get_degree(0),
-        padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        padding_block_lut_f, gpu_memory_allocated);
-    // auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-    padding_block_lut_univariate->broadcast_lut(active_streams);
+    padding_block_lut_univariate->generate_and_broadcast_lut(
+        active_streams, {0}, {padding_block_lut_f}, gpu_memory_allocated);

    lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -339,16 +317,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return message_of_current_block + carry_of_previous_block;
      };

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          shift_blocks_lut_bivariate->get_lut(0, 0),
-          shift_blocks_lut_bivariate->get_degree(0),
-          shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          blocks_lut_f, gpu_memory_allocated);
      auto active_streams_shift_blocks =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
+      shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
+          active_streams_shift_blocks, {0}, {blocks_lut_f},
+          gpu_memory_allocated);

      lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
    }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
@@ -113,27 +113,21 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
      else
        return current_bit;
    };
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
-        mux_lut->get_degree(0), mux_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, mux_lut_f, gpu_memory_allocated);
+    ;
    auto active_gpu_count_mux = streams.active_gpu_subset(
        bits_per_block * num_radix_blocks, params.pbs_type);
-    mux_lut->broadcast_lut(active_gpu_count_mux);
+
+    mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
+                                        gpu_memory_allocated);

    auto cleaning_lut_f = [params](Torus x) -> Torus {
      return x % params.message_modulus;
    };
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
-        cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
+
    auto active_gpu_count_cleaning =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
+    cleaning_lut->generate_and_broadcast_lut(
+        active_gpu_count_cleaning, {0}, {cleaning_lut_f}, gpu_memory_allocated);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
@@ -74,45 +74,26 @@ template <typename Torus> struct int_overflowing_sub_memory {
                                           luts_array, size_tracker,
                                           allocate_gpu_memory, size_tracker);

-    auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
-    auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
-
-    // generate luts (aka accumulators)
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
-        luts_array->get_degree(0), luts_array->get_max_degree(0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        f_lut_does_block_generate_carry, gpu_memory_allocated);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        lut_does_block_generate_or_propagate, luts_array->get_degree(1),
-        luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
-        gpu_memory_allocated);
    if (allocate_gpu_memory)
      cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                  luts_array->get_lut_indexes(0, 1), 1,
                                  num_radix_blocks - 1);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        luts_borrow_propagation_sum->get_lut(0, 0),
-        luts_borrow_propagation_sum->get_degree(0),
-        luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
-        f_luts_borrow_propagation_sum, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
-        message_acc->get_degree(0), message_acc->get_max_degree(0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        f_message_acc, gpu_memory_allocated);
-
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_array->broadcast_lut(active_streams);
-    luts_borrow_propagation_sum->broadcast_lut(active_streams);
-    message_acc->broadcast_lut(active_streams);
+    luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {f_luts_borrow_propagation_sum},
+        gpu_memory_allocated);
+
+    luts_array->generate_and_broadcast_lut(
+        active_streams, {0, 1},
+        {f_lut_does_block_generate_carry,
+         f_lut_does_block_generate_or_propagate},
+        gpu_memory_allocated);
+    // generate luts (aka accumulators)
+
+    message_acc->generate_and_broadcast_lut(
+        active_streams, {0}, {f_message_acc}, gpu_memory_allocated);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
@@ -298,14 +298,10 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
      int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          id_fn, allocate_gpu_memory);
+      lut->generate_and_broadcast_lut(
+          streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
+          allocate_gpu_memory);

-      lut->broadcast_lut(
-          streams.active_gpu_subset(num_blocks, params.pbs_type));
      this->stream_identity_luts[i] = lut;
    }

@@ -318,27 +314,17 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {

    this->message_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->message_extract_lut->get_lut(0, 0),
-        this->message_extract_lut->get_degree(0),
-        this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        msg_fn, allocate_gpu_memory);
-    this->message_extract_lut->broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type));
+
+    this->message_extract_lut->generate_and_broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
+        allocate_gpu_memory);

    this->carry_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->carry_extract_lut->get_lut(0, 0),
-        this->carry_extract_lut->get_degree(0),
-        this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        carry_fn, allocate_gpu_memory);
-    this->carry_extract_lut->broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type));
+
+    this->carry_extract_lut->generate_and_broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
+        allocate_gpu_memory);

    this->partial_aggregated_vectors =
        new CudaRadixCiphertextFFI *[num_streams];
@@ -1185,15 +1171,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->prefix_sum_lut->get_lut(0, 0),
-        this->prefix_sum_lut->get_degree(0),
-        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        prefix_sum_fn, allocate_gpu_memory);
-    this->prefix_sum_lut->broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type));
+    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
+        {prefix_sum_fn}, allocate_gpu_memory);

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1203,14 +1183,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
-        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        cleanup_fn, allocate_gpu_memory);
-    this->cleanup_lut->broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type));
+    this->cleanup_lut->generate_and_broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
+        {cleanup_fn}, allocate_gpu_memory);
  }

  void release(CudaStreams streams) {
@@ -1376,15 +1351,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->prefix_sum_lut->get_lut(0, 0),
-        this->prefix_sum_lut->get_degree(0),
-        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        prefix_sum_fn, allocate_gpu_memory);
-    this->prefix_sum_lut->broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type));
+    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
+        {prefix_sum_fn}, allocate_gpu_memory);

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1394,14 +1363,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
-        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        cleanup_fn, allocate_gpu_memory);
-    this->cleanup_lut->broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type));
+    this->cleanup_lut->generate_and_broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
+        {cleanup_fn}, allocate_gpu_memory);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
@@ -30,15 +30,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
    std::function<Torus(Torus, Torus)> and_lambda =
        [](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
-        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, and_lambda, allocate_gpu_memory);
-
    auto active_streams_and =
        streams.active_gpu_subset(total_lut_ops, params.pbs_type);
-    this->and_lut->broadcast_lut(active_streams_and);
+    this->and_lut->generate_and_broadcast_bivariate_lut(
+        active_streams_and, {0}, {and_lambda}, allocate_gpu_memory);
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
@@ -50,15 +45,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
      return x & 1;
    };

-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
-        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, flush_lambda, allocate_gpu_memory);
-
    auto active_streams_flush =
        streams.active_gpu_subset(total_flush_ops, params.pbs_type);
-    this->flush_lut->broadcast_lut(active_streams_flush);
+    this->flush_lut->generate_and_broadcast_lut(
+        active_streams_flush, {0}, {flush_lambda}, allocate_gpu_memory);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -174,40 +174,6 @@ template <typename Torus> struct zk_expand_mem {
    message_and_carry_extract_luts = new int_radix_lut<Torus>(
        streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_and_carry_extract_luts->get_lut(0, 0),
-        message_and_carry_extract_luts->get_degree(0),
-        message_and_carry_extract_luts->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_and_carry_extract_luts->get_lut(0, 1),
-        message_and_carry_extract_luts->get_degree(1),
-        message_and_carry_extract_luts->get_max_degree(1),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_and_carry_extract_luts->get_lut(0, 2),
-        message_and_carry_extract_luts->get_degree(2),
-        message_and_carry_extract_luts->get_max_degree(2),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, message_extract_and_sanitize_bool_lut_f,
-        gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_and_carry_extract_luts->get_lut(0, 3),
-        message_and_carry_extract_luts->get_degree(3),
-        message_and_carry_extract_luts->get_max_degree(3),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
-        gpu_memory_allocated);
-
    // We are always packing two LWEs. We just need to be sure we have enough
    // space in the carry part to store a message of the same size as is in the
    // message part.
@@ -292,7 +258,13 @@ template <typename Torus> struct zk_expand_mem {

    auto active_streams =
        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
-    message_and_carry_extract_luts->broadcast_lut(active_streams);
+
+    message_and_carry_extract_luts->generate_and_broadcast_lut(
+        active_streams, {0, 1, 2, 3},
+        {message_extract_lut_f, carry_extract_lut_f,
+         message_extract_and_sanitize_bool_lut_f,
+         carry_extract_and_sanitize_bool_lut_f},
+        gpu_memory_allocated);

    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1067,6 +1067,85 @@ void generate_device_accumulator_bivariate(
  POP_RANGE()
 }

+template <typename Torus> struct int_lut_cache {
+  int_lut_cache() {}
+  // Encodes f as a host LUT; writes *degree / *max_degree; returns the buffer.
+  Torus *get_cached_univariate_lut(std::function<Torus(Torus)> &f, uint64_t *degree,
+                        uint64_t *max_degree, uint32_t glwe_dimension,
+                        uint32_t polynomial_size,
+                        uint32_t input_message_modulus,
+                        uint32_t input_carry_modulus,
+                        uint32_t output_message_modulus,
+                        uint32_t output_carry_modulus) {
+    /*__int128_t f_hash = 0;
+    uint32_t bits_per_lut_val = 5;
+    uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
+    for (uint32_t i = 0; i < input_modulus_sup; ++i) {
+      Torus f_eval = f(i);
+      GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
+                 "LUT value expected bitwidth overflow");
+      f_hash |= f_eval;
+      f_hash <<= bits_per_lut_val;
+    }
+
+    std::lock_guard cache_lock(_mutex);
+    if (_lut_cache.find(f_hash) != _lut_cache.end()) {
+      lut_ptr &ptr = _lut_cache[f_hash];
+      GPU_ASSERT(ptr.output_message_modulus == output_message_modulus,
+                 "Error modulus");
+      GPU_ASSERT(ptr.input_message_modulus == input_message_modulus,
+                 "Error modulus");
+      GPU_ASSERT(ptr.glwe_dimension == glwe_dimension, "Error modulus");
+      *max_degree = ptr.max_degree;
+      *degree = ptr.degree;
+      return ptr.ptr;
+    }*/
+    // NOTE: the lookup above is commented out; every call takes the path below.
+    // host lut: malloc'd on each call and not recorded or freed by this struct
+    Torus *h_lut =
+        (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
+
+    *max_degree = input_message_modulus * input_carry_modulus - 1;
+    *degree = generate_lookup_table_with_encoding<Torus>(
+        h_lut, glwe_dimension, polynomial_size, input_message_modulus,
+        input_carry_modulus, output_message_modulus, output_carry_modulus, f);
+    // NOTE: insertion below is disabled, so nothing is added to _lut_cache here.
+    /*lut_ptr new_ptr = {h_lut,
+                       glwe_dimension,
+                       input_message_modulus,
+                       input_carry_modulus,
+                       output_message_modulus,
+                       output_carry_modulus,
+                       *max_degree,
+                       *degree};*/
+    //_lut_cache[f_hash] = new_ptr;
+    return h_lut;
+  }
+  // Frees only the buffers recorded in _lut_cache, then empties the map.
+  ~int_lut_cache() {
+    std::lock_guard cache_lock(_mutex);
+    for (auto v : _lut_cache) {
+      free(v.second.ptr);
+    }
+    _lut_cache.clear();
+  }
+
+private:
+  struct lut_ptr {
+    Torus *ptr;
+    uint32_t glwe_dimension;
+    uint32_t input_message_modulus;
+    uint32_t input_carry_modulus;
+    uint32_t output_message_modulus;
+    uint32_t output_carry_modulus;
+    uint64_t max_degree;
+    uint64_t degree;
+  };
+  std::map<__int128_t, lut_ptr> _lut_cache; // key: f_hash (see disabled code)
+  std::mutex _mutex;
+};
+static int_lut_cache<uint64_t> g_LutCache64; // static: one instance per TU
+
 /*
 *  generate bivariate accumulator with factor scaling for device pointer
 *    v_stream - cuda stream
@@ -1098,8 +1177,8 @@ void generate_device_accumulator_bivariate_with_factor(
      (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
      gpu_memory_allocated);

-  cuda_synchronize_stream(stream, gpu_index);
-  free(h_lut);
+//  cuda_synchronize_stream(stream, gpu_index);
+//  free(h_lut);
 }
 /*
 *  generate bivariate accumulator for device pointer
@@ -1145,23 +1224,36 @@ void generate_device_accumulator_with_encoding(
    uint32_t output_message_modulus, uint32_t output_carry_modulus,
    std::function<Torus(Torus)> f, bool gpu_memory_allocated) {

+  static constexpr auto is_u64 = std::is_same_v<Torus, uint64_t>;
+  Torus *h_lut = nullptr;
  // host lut
-  Torus *h_lut =
-      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
-
-  *max_degree = input_message_modulus * input_carry_modulus - 1;
-  // fill accumulator
-  *degree = generate_lookup_table_with_encoding<Torus>(
-      h_lut, glwe_dimension, polynomial_size, input_message_modulus,
-      input_carry_modulus, output_message_modulus, output_carry_modulus, f);
+  if constexpr (is_u64) {
+    h_lut = g_LutCache64.get_cached_univariate_lut(
+        f, degree, max_degree, glwe_dimension, polynomial_size,
+        input_message_modulus, input_carry_modulus, output_message_modulus,
+        output_carry_modulus);
+  } else {
+    h_lut =
+        (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));

+    *max_degree = input_message_modulus * input_carry_modulus - 1;
+    // fill accumulator
+    *degree = generate_lookup_table_with_encoding<Torus>(
+        h_lut, glwe_dimension, polynomial_size, input_message_modulus,
+        input_carry_modulus, output_message_modulus, output_carry_modulus, f);
+  }
+/*
  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_with_size_tracking_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index, gpu_memory_allocated);
-  cuda_synchronize_stream(stream, gpu_index);
-  free(h_lut);
+*/
+  if (!std::is_same_v<Torus, uint64_t>) {
+    cuda_synchronize_stream(stream, gpu_index);
+    free(h_lut);
+  }
 }
+
 template <typename Torus>
 void generate_device_accumulator_with_encoding_with_cpu_prealloc(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
@@ -1264,8 +1356,8 @@ void generate_many_lut_device_accumulator(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index, gpu_memory_allocated);

-  cuda_synchronize_stream(stream, gpu_index);
-  free(h_lut);
+  //cuda_synchronize_stream(stream, gpu_index);
+  //free(h_lut);
  POP_RANGE()
 }
Author	SHA1	Message	Date
Andrei Stoian	b4ea48165b	fix(gpu): disable cache	2026-01-23 15:50:37 +01:00
Andrei Stoian	0a6b62627d	fix(gpu): remove broadcast	2026-01-23 13:00:44 +01:00
Andrei Stoian	6deeb66bf8	fix(gpu): test remove sync on lut create	2026-01-23 11:27:47 +01:00
Andrei Stoian	17022dae69	feat(gpu): lut cache univariate	2026-01-22 16:56:14 +01:00
Andrei Stoian	09802dd5ee	feat(gpu): lut cache	2026-01-22 10:04:39 +01:00
Andrei Stoian	e3fe433a35	fix(gpu): univariate fix	2026-01-21 17:24:16 +01:00
Andrei Stoian	2bea35a3b5	fix(gpu): finish bivariate	2026-01-21 16:13:21 +01:00
Andrei Stoian	e2bf226276	fix(gpu): start bivariate, fix all univariate	2026-01-21 15:24:51 +01:00
Andrei Stoian	c66f1c6d8b	fix(gpu): all univariate luts	2026-01-21 12:06:21 +01:00
Andrei Stoian	9bfe190ad3	fix(gpu): sc prop fix	2026-01-21 11:48:21 +01:00
Andrei Stoian	e40070db0e	fix(gpu): sc prop encapsulate lut	2026-01-21 10:35:21 +01:00
Andrei Stoian	e8d5ceac68	fix(gpu): more lut encaps	2026-01-20 15:43:24 +01:00
Andrei Stoian	f1526b29d8	fix(gpu): more lut	2026-01-19 17:55:45 +01:00
Andrei Stoian	602e0c5a19	fix(gpu): more lut encaps	2026-01-19 15:13:07 +01:00
Andrei Stoian	163c1eeffb	chore(gpu): refactor lut generation	2026-01-16 16:02:17 +01:00