Compare commits

...

4 Commits

Author       SHA1        Message                                                    Date
Agnes Leroy  eb3b988380  Use internal streams                                       2025-12-05 15:28:07 +01:00
Agnes Leroy  ec19579c39  Wrap erc20 from backend to hl api                          2025-12-05 15:23:08 +01:00
Agnes Leroy  8ed3b4b59d  chore(gpu): reuse CPU LUT buffer to generate accumulators  2025-12-05 15:23:07 +01:00
Agnes Leroy  20daf182f0  Experiment with erc20 in the backend                       2025-12-05 15:23:07 +01:00
28 changed files with 992 additions and 171 deletions

View File

@@ -86,6 +86,7 @@ fn main() {
"cuda/include/integer/integer.h",
"cuda/include/integer/rerand.h",
"cuda/include/aes/aes.h",
"cuda/include/erc20/erc20.h",
"cuda/include/zk/zk.h",
"cuda/include/keyswitch/keyswitch.h",
"cuda/include/keyswitch/ks_enums.h",

View File

@@ -0,0 +1,20 @@
#pragma once
#include "../integer/integer.h"
extern "C" {
uint64_t scratch_cuda_erc20_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
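// In-place transfer on device ciphertexts: performs the encrypted funds
// check, masks the amount with it, then updates both balances.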
void cuda_erc20_assign_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *from_amount,
CudaRadixCiphertextFFI *to_amount,
CudaRadixCiphertextFFI const *amount, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);
void cleanup_cuda_erc20(CudaStreamsFFI streams, int8_t **mem_ptr_void);
}

View File

@@ -0,0 +1,82 @@
#pragma once
#include "../integer/integer_utilities.h"
#include "integer/comparison.h"
#include "integer/multiplication.h"
#include "integer/subtraction.h"
template <typename Torus> struct int_erc20_buffer {
int_radix_params params;
int_comparison_buffer<Torus> *diff_buffer;
int_mul_memory<Torus> *mul_buffer;
int_sc_prop_memory<Torus> *add_buffer;
int_sub_and_propagate<Torus> *sub_buffer;
CudaRadixCiphertextFFI *tmp_amount;
CudaRadixCiphertextFFI *has_enough_funds;
CudaStreams active_streams;
InternalCudaStreams internal_cuda_streams;
uint32_t num_internal_streams;
bool allocate_gpu_memory;
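// Single host-side LUT staging buffer, reused by every accumulator generated
// in the sub-buffers below and freed in release() after a final stream sync.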
Torus *preallocated_h_lut;
int_erc20_buffer(CudaStreams streams, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
diff_buffer = new int_comparison_buffer<Torus>(
streams, COMPARISON_TYPE::GT, params, num_radix_blocks, false,
allocate_gpu_memory, size_tracker, preallocated_h_lut);
mul_buffer = new int_mul_memory<Torus>(
streams, params, false, true, num_radix_blocks, allocate_gpu_memory,
size_tracker, preallocated_h_lut);
add_buffer = new int_sc_prop_memory<Torus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker, preallocated_h_lut);
sub_buffer = new int_sub_and_propagate<Torus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker, preallocated_h_lut);
tmp_amount = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), tmp_amount, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
has_enough_funds = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), has_enough_funds, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
active_streams = streams.active_gpu_subset(num_radix_blocks);
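// Two internal streams so the final addition and subtraction can run
// concurrently (see host_erc20_assign).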
num_internal_streams = 2;
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_internal_streams);
}
void release(CudaStreams streams) {
diff_buffer->release(streams);
delete diff_buffer;
diff_buffer = nullptr;
mul_buffer->release(streams);
delete mul_buffer;
mul_buffer = nullptr;
add_buffer->release(streams);
delete add_buffer;
add_buffer = nullptr;
sub_buffer->release(streams);
delete sub_buffer;
sub_buffer = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
tmp_amount, this->allocate_gpu_memory);
delete tmp_amount;
tmp_amount = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
has_enough_funds, this->allocate_gpu_memory);
delete has_enough_funds;
has_enough_funds = nullptr;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(preallocated_h_lut);
}
};

View File

@@ -43,7 +43,8 @@ template <typename Torus> struct int_cmux_buffer {
int_cmux_buffer(CudaStreams streams,
std::function<Torus(Torus)> predicate_lut_f,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
@@ -88,20 +89,21 @@ template <typename Torus> struct int_cmux_buffer {
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
params.carry_modulus, inverted_lut_f, gpu_memory_allocated,
preallocated_h_lut);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, gpu_memory_allocated);
params.carry_modulus, lut_f, gpu_memory_allocated, preallocated_h_lut);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
message_extract_lut_f, gpu_memory_allocated);
message_extract_lut_f, gpu_memory_allocated, preallocated_h_lut);
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {

View File

@@ -384,7 +384,8 @@ template <typename Torus> struct int_comparison_buffer {
int_comparison_buffer(CudaStreams streams, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
bool is_signed, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t &size_tracker,
Torus *preallocated_h_lut_from_elsewhere = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->op = op;
@@ -426,7 +427,8 @@ template <typename Torus> struct int_comparison_buffer {
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
params.carry_modulus, identity_lut_f, gpu_memory_allocated,
preallocated_h_lut_from_elsewhere);
identity_lut->broadcast_lut(active_streams);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
@@ -441,7 +443,8 @@ template <typename Torus> struct int_comparison_buffer {
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
params.carry_modulus, is_zero_f, gpu_memory_allocated,
preallocated_h_lut_from_elsewhere);
is_zero_lut->broadcast_lut(active_streams);
@@ -456,7 +459,8 @@ template <typename Torus> struct int_comparison_buffer {
else
return (x == IS_INFERIOR);
},
params, num_radix_blocks, allocate_gpu_memory, size_tracker);
params, num_radix_blocks, allocate_gpu_memory, size_tracker,
preallocated_h_lut_from_elsewhere);
case COMPARISON_TYPE::GT:
case COMPARISON_TYPE::GE:
case COMPARISON_TYPE::LT:

View File

@@ -77,7 +77,8 @@ void generate_device_accumulator_bivariate(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated);
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
Torus *preallocated_h_lut = nullptr);
template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
@@ -114,14 +115,16 @@ void generate_device_accumulator(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus)> f, bool gpu_memory_allocated);
std::function<Torus(Torus)> f, bool gpu_memory_allocated,
Torus *preallocated_h_lut = nullptr);
template <typename Torus>
void generate_many_lut_device_accumulator(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degrees,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated);
std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated,
Torus *preallocated_h_lut = nullptr);
struct radix_columns {
std::vector<uint32_t> columns_counter;
@@ -1160,6 +1163,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
bool mem_reuse = false;
bool allocated_luts_message_carry;
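// Optional caller-owned host LUT staging buffer; when nullptr, each LUT
// generation allocates and synchronously frees its own buffer.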
Torus *preallocated_h_lut;
void setup_index_buffers(CudaStreams streams, uint64_t &size_tracker) {
@@ -1206,7 +1210,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
}
void setup_lookup_tables(CudaStreams streams, uint32_t num_radix_in_vec,
const uint64_t *const degrees) {
const uint64_t *const degrees,
Torus *preallocated_h_lut = nullptr) {
uint32_t message_modulus = params.message_modulus;
bool _needs_processing = false;
radix_columns current_columns(degrees, num_blocks_in_radix,
@@ -1257,13 +1262,13 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
lut_f_message, gpu_memory_allocated, preallocated_h_lut);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
lut_f_carry, gpu_memory_allocated, preallocated_h_lut);
auto active_gpu_count_mc = streams.active_gpu_subset(pbs_count);
luts_message_carry->broadcast_lut(active_gpu_count_mc);
}
@@ -1272,7 +1277,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaStreams streams, int_radix_params params,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
bool reduce_degrees_for_single_carry_propagation,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
this->params = params;
this->mem_reuse = false;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
@@ -1284,6 +1290,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->allocated_luts_message_carry = false;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
this->preallocated_h_lut = preallocated_h_lut;
setup_index_buffers(streams, size_tracker);
// because setup_lookup_tables is called in the host function for
// sum_ciphertexts, to save memory
@@ -1318,7 +1325,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut,
bool reduce_degrees_for_single_carry_propagation,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
this->mem_reuse = true;
this->params = params;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
@@ -1334,6 +1342,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
this->preallocated_h_lut = preallocated_h_lut;
uint64_t message_modulus_bits = (uint64_t)std::log2(params.message_modulus);
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
@@ -1395,10 +1404,12 @@ template <typename Torus> struct int_seq_group_prop_memory {
int_radix_lut<Torus> *lut_sequential_algorithm;
uint32_t grouping_size;
bool gpu_memory_allocated;
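// Kept as a member so release() can free it after the final stream sync.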
Torus *h_seq_lut_indexes;
int_seq_group_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t group_size, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -1413,7 +1424,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
allocate_gpu_memory);
int num_seq_luts = grouping_size - 1;
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
lut_sequential_algorithm =
new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
allocate_gpu_memory, size_tracker);
@@ -1427,7 +1438,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
lut_sequential_algorithm->get_degree(index),
lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
h_seq_lut_indexes[index] = index;
}
Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
@@ -1436,7 +1447,6 @@ template <typename Torus> struct int_seq_group_prop_memory {
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(num_seq_luts);
lut_sequential_algorithm->broadcast_lut(active_streams);
free(h_seq_lut_indexes);
};
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
@@ -1446,6 +1456,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
delete group_resolved_carries;
delete lut_sequential_algorithm;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_seq_lut_indexes);
};
};
@@ -1457,7 +1468,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
int_hs_group_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_groups, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -1487,7 +1499,7 @@ template <typename Torus> struct int_hs_group_prop_memory {
lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_hillis_steele,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(num_groups);
lut_hillis_steele->broadcast_lut(active_streams);
};
@@ -1511,7 +1523,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
int_shifted_blocks_and_states_memory(
CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t &size_tracker, Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
@@ -1561,7 +1573,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
streams.stream(0), streams.gpu_index(0), first_block_lut,
first_block_lut_degrees, first_block_lut_max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
// luts for other blocks of the first grouping
for (int lut_id = 1; lut_id < grouping_size; lut_id++) {
@@ -1584,7 +1596,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f_grouping_luts, gpu_memory_allocated);
carry_modulus, f_grouping_luts, gpu_memory_allocated,
preallocated_h_lut);
}
// luts for the rest of groupings (except for the last block)
@@ -1610,7 +1623,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f_grouping_luts, gpu_memory_allocated);
carry_modulus, f_grouping_luts, gpu_memory_allocated,
preallocated_h_lut);
}
// For the last block we need to generate a new lut
@@ -1635,7 +1649,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
streams.stream(0), streams.gpu_index(0), last_block_lut,
last_block_lut_degrees, last_block_lut_max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
// Generate the indexes to switch between luts within the pbs
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
@@ -1706,11 +1720,12 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
uint32_t group_size;
bool use_sequential_algorithm_to_resolve_group_carries;
bool gpu_memory_allocated;
Torus *h_second_lut_indexes;
int_prop_simu_group_carries_memory(
CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t &size_tracker, Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
@@ -1803,7 +1818,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_inner_propagation, gpu_memory_allocated);
f_first_grouping_inner_propagation, gpu_memory_allocated,
preallocated_h_lut);
}
auto f_first_grouping_outer_propagation =
@@ -1818,7 +1834,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_outer_propagation, gpu_memory_allocated);
f_first_grouping_outer_propagation, gpu_memory_allocated,
preallocated_h_lut);
// for other groupings inner propagation
for (int index = 0; index < grouping_size; index++) {
@@ -1842,7 +1859,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_other_groupings_inner_propagation, gpu_memory_allocated);
f_other_groupings_inner_propagation, gpu_memory_allocated,
preallocated_h_lut);
}
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -1864,7 +1882,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_group_propagation, gpu_memory_allocated);
f_group_propagation, gpu_memory_allocated, preallocated_h_lut);
}
} else {
uint32_t lut_id = 2 * grouping_size;
@@ -1882,10 +1900,10 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_group_propagation,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
}
Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
for (int index = 0; index < num_radix_blocks; index++) {
uint32_t grouping_index = index / grouping_size;
@@ -1937,15 +1955,13 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
seq_group_prop_mem = new int_seq_group_prop_memory<Torus>(
streams, params, grouping_size, big_lwe_size_bytes,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, size_tracker, preallocated_h_lut);
} else {
hs_group_prop_mem = new int_hs_group_prop_memory<Torus>(
streams, params, num_groups, big_lwe_size_bytes, allocate_gpu_memory,
size_tracker);
size_tracker, preallocated_h_lut);
}
free(h_second_lut_indexes);
};
// needed for the division to update the lut indexes
@@ -1996,6 +2012,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
delete luts_array_second_step;
delete[] h_scalar_array_cum_sum;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_second_lut_indexes);
};
};
@@ -2020,7 +2037,8 @@ template <typename Torus> struct int_sc_prop_memory {
int_sc_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
@@ -2040,11 +2058,11 @@ template <typename Torus> struct int_sc_prop_memory {
shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory<Torus>(
streams, params, num_radix_blocks, num_many_lut, grouping_size,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, size_tracker, preallocated_h_lut);
prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory<Torus>(
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, size_tracker, preallocated_h_lut);
// Step 3 elements
int num_luts_message_extract =
@@ -2061,8 +2079,8 @@ template <typename Torus> struct int_sc_prop_memory {
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
message_modulus, carry_modulus, f_message_extract, gpu_memory_allocated,
preallocated_h_lut);
// This stores a single block that will be used to hold the overflow or
// carry results
@@ -2120,7 +2138,7 @@ template <typename Torus> struct int_sc_prop_memory {
lut_overflow_flag_prep->get_degree(0),
lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
lut_overflow_flag_prep->broadcast_lut(active_streams);
@@ -2152,7 +2170,7 @@ template <typename Torus> struct int_sc_prop_memory {
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
@@ -2179,7 +2197,7 @@ template <typename Torus> struct int_sc_prop_memory {
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_carry_last,
gpu_memory_allocated);
gpu_memory_allocated, preallocated_h_lut);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {

View File

@@ -21,7 +21,7 @@ template <typename Torus> struct int_mul_memory {
int_mul_memory(CudaStreams streams, int_radix_params params,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
uint64_t &size_tracker, Torus *preallocated_h_lut = nullptr) {
gpu_memory_allocated = allocate_gpu_memory;
this->boolean_mul = is_boolean_left || is_boolean_right;
this->params = params;
@@ -43,7 +43,7 @@ template <typename Torus> struct int_mul_memory {
zero_out_predicate_lut->get_degree(0),
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
zero_out_predicate_lut_f, gpu_memory_allocated, preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
zero_out_predicate_lut->broadcast_lut(active_streams);

View File

@@ -142,14 +142,15 @@ template <typename Torus> struct int_sub_and_propagate {
int_sub_and_propagate(CudaStreams streams, const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker,
Torus *preallocated_h_lut = nullptr) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, params, num_radix_blocks, requested_flag_in,
allocate_gpu_memory, size_tracker);
allocate_gpu_memory, size_tracker, preallocated_h_lut);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

View File

@@ -1,6 +1,5 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
pbs/programmable_bootstrap_multibit_128.cu)
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

View File

@@ -0,0 +1,85 @@
#include "erc20/erc20.cuh"
uint64_t scratch_cuda_erc20_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch erc20")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
uint64_t ret = scratch_cuda_erc20<uint64_t>(
CudaStreams(streams), (int_erc20_buffer<uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, allocate_gpu_memory);
POP_RANGE()
return ret;
}
void cuda_erc20_assign_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *from_amount,
CudaRadixCiphertextFFI *to_amount,
CudaRadixCiphertextFFI const *amount, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {
PUSH_RANGE("erc20")
auto mem = reinterpret_cast<int_erc20_buffer<uint64_t> *>(mem_ptr);
switch (mem->params.polynomial_size) {
case 256:
host_erc20_assign<uint64_t, AmortizedDegree<256>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 512:
host_erc20_assign<uint64_t, AmortizedDegree<512>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 1024:
host_erc20_assign<uint64_t, AmortizedDegree<1024>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 2048:
host_erc20_assign<uint64_t, AmortizedDegree<2048>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 4096:
host_erc20_assign<uint64_t, AmortizedDegree<4096>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 8192:
host_erc20_assign<uint64_t, AmortizedDegree<8192>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
case 16384:
host_erc20_assign<uint64_t, AmortizedDegree<16384>>(
CudaStreams(streams), from_amount, to_amount, amount, mem, bsks,
(uint64_t **)(ksks));
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
POP_RANGE()
}
void cleanup_cuda_erc20(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup erc20")
int_erc20_buffer<uint64_t> *mem_ptr =
(int_erc20_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
POP_RANGE()
}

View File

@@ -0,0 +1,49 @@
#pragma once
#include "erc20/erc20.h"
#include "erc20/erc20_utilities.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/multiplication.cuh"
#include "integer/subtraction.cuh"
template <typename Torus, class params>
__host__ void host_erc20_assign(CudaStreams streams,
CudaRadixCiphertextFFI *from_amount,
CudaRadixCiphertextFFI *to_amount,
CudaRadixCiphertextFFI const *amount,
int_erc20_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
auto num_radix_blocks = from_amount->num_radix_blocks;
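// Step 1: encrypted funds check, comparing from_amount against amount.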
host_difference_check<Torus>(streams, mem_ptr->has_enough_funds, from_amount,
amount, mem_ptr->diff_buffer,
mem_ptr->diff_buffer->diff_buffer->operator_f,
bsks, ksks, num_radix_blocks);
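// Step 2: mask the transferred amount with the (0 or 1) check result:
// tmp_amount = amount * has_enough_funds.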
host_integer_mult_radix<Torus, params>(
streams, mem_ptr->tmp_amount, amount, false, mem_ptr->has_enough_funds,
true, bsks, ksks, mem_ptr->mul_buffer, num_radix_blocks);
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
// internal stream 0: to_amount += masked amount
host_add_and_propagate_single_carry(
mem_ptr->internal_cuda_streams[0], to_amount, mem_ptr->tmp_amount,
nullptr, nullptr, mem_ptr->add_buffer, bsks, ksks, FLAG_NONE, 0);
// internal stream 1: from_amount -= masked amount
host_sub_and_propagate_single_carry(
mem_ptr->internal_cuda_streams[1], from_amount, mem_ptr->tmp_amount,
nullptr, nullptr, mem_ptr->sub_buffer, bsks, ksks, FLAG_NONE, 0);
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
}
template <typename Torus>
__host__ uint64_t scratch_cuda_erc20(CudaStreams streams,
int_erc20_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks,
int_radix_params params,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_erc20_buffer<Torus>(streams, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -134,7 +134,7 @@ __host__ void are_all_comparisons_block_true(
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
return x == chunk_length;
};
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
is_max_value_lut->get_max_degree(1), glwe_dimension,
@@ -482,7 +482,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
y = x;
f = sign_handler_f;
}
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true,

View File

@@ -9,7 +9,6 @@
#include "integer/scalar_addition.cuh"
#include "linearalgebra/addition.cuh"
#include "linearalgebra/negation.cuh"
#include "pbs/pbs_128_utilities.h"
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
@@ -1040,26 +1039,41 @@ void generate_device_accumulator_bivariate(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated) {
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
Torus *preallocated_cpu_lut) {
PUSH_RANGE("gen bivar lut acc")
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
*max_degree = message_modulus * carry_modulus - 1;
// fill bivariate accumulator
*degree = generate_lookup_table_bivariate<Torus>(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f);
if (preallocated_cpu_lut == nullptr) {
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
*max_degree = message_modulus * carry_modulus - 1;
// fill bivariate accumulator
*degree = generate_lookup_table_bivariate<Torus>(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
gpu_memory_allocated);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
} else {
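// Caller-owned buffer: no stream sync or free here; the caller must keep the
// buffer alive until it synchronizes the stream (e.g. in its release()).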
*max_degree = message_modulus * carry_modulus - 1;
// fill bivariate accumulator
*degree = generate_lookup_table_bivariate<Torus>(
preallocated_cpu_lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, preallocated_cpu_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
gpu_index, gpu_memory_allocated);
}
POP_RANGE()
}
@@ -1097,41 +1111,6 @@ void generate_device_accumulator_bivariate_with_factor(
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
/*
* generate bivariate accumulator for device pointer
* using preallocated host lut to avoid blocking the cpu thread
* with the stream synchronization (required to free the host lut).
* This enables concurrent execution of multiple streams when using
* a single cpu thread.
* stream - cuda stream
* acc - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
* h_lut - preallocated host lut to be used
*
*/
template <typename Torus>
void generate_device_accumulator_bivariate_with_cpu_prealloc(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated,
Torus *h_lut) {
PUSH_RANGE("gen bivar lut acc")
*max_degree = message_modulus * carry_modulus - 1;
// fill bivariate accumulator
*degree = generate_lookup_table_bivariate<Torus>(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
gpu_memory_allocated);
POP_RANGE()
}
template <typename Torus>
void generate_device_accumulator_with_encoding(
@@ -1190,33 +1169,6 @@ void generate_device_accumulator_with_encoding_with_cpu_prealloc(
*/
template <typename Torus>
void generate_device_accumulator(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus)> f, bool gpu_memory_allocated) {
PUSH_RANGE("gen lut acc")
generate_device_accumulator_with_encoding(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f, gpu_memory_allocated);
POP_RANGE()
}
/*
* generate accumulator for device pointer using preallocated
* host lut to avoid blocking the cpu thread with the stream
* synchronization (required to free the host lut).
* This enables concurrent execution of multiple streams when using
* a single cpu thread.
* v_stream - cuda stream
* acc - device pointer for accumulator
* ...
* f - evaluating function with one Torus input
* h_lut - preallocated host lut to be used
*/
template <typename Torus>
void generate_device_accumulator_with_cpu_prealloc(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
@@ -1224,12 +1176,19 @@ void generate_device_accumulator_with_cpu_prealloc(
Torus *preallocated_h_lut) {
PUSH_RANGE("gen lut acc")
generate_device_accumulator_with_encoding_with_cpu_prealloc(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f, gpu_memory_allocated, preallocated_h_lut);
if (preallocated_h_lut != nullptr)
generate_device_accumulator_with_encoding_with_cpu_prealloc(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f, gpu_memory_allocated, preallocated_h_lut);
else
generate_device_accumulator_with_encoding(
stream, gpu_index, acc, degree, max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, message_modulus,
carry_modulus, f, gpu_memory_allocated);
POP_RANGE()
}
/*
* generate many lut accumulator for device pointer
* v_stream - cuda stream
@@ -1243,25 +1202,38 @@ void generate_many_lut_device_accumulator(
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::vector<std::function<Torus(Torus)>> &functions,
bool gpu_memory_allocated) {
bool gpu_memory_allocated, Torus *preallocated_h_lut) {
PUSH_RANGE("gen many lut acc")
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
if (preallocated_h_lut == nullptr) {
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
// fill accumulator
*max_degree = generate_many_lookup_table<Torus>(
h_lut, degrees, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, functions);
// fill accumulator
*max_degree = generate_many_lookup_table<Torus>(
h_lut, degrees, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, functions);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
} else {
// fill accumulator
*max_degree = generate_many_lookup_table<Torus>(
preallocated_h_lut, degrees, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, functions);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, preallocated_h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream,
gpu_index, gpu_memory_allocated);
}
POP_RANGE()
}
@@ -1732,7 +1704,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
signs_array_in, 0, num_sign_blocks);
if (num_sign_blocks > 2) {
auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
@@ -1763,7 +1735,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
};
auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
@@ -1783,7 +1755,7 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
};
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,

View File

@@ -366,7 +366,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,
current_blocks->degrees);
current_blocks->degrees,
mem_ptr->preallocated_h_lut);
while (needs_processing) {
auto luts_message_carry = mem_ptr->luts_message_carry;

View File

@@ -142,7 +142,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
};
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
@@ -235,7 +235,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
};
auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
@@ -269,7 +269,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
int_radix_lut<Torus> *one_block_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
@@ -413,7 +413,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
};
auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
@@ -515,7 +515,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
};
auto signed_msb_lut = mem_ptr->signed_msb_lut;
generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
generate_device_accumulator_bivariate<Torus>(
msb_streams.stream(0), streams.gpu_index(0),
signed_msb_lut->get_lut(0, 0), signed_msb_lut->get_degree(0),
signed_msb_lut->get_max_degree(0), params.glwe_dimension,
@@ -561,7 +561,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
int_radix_lut<Torus> *one_block_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
generate_device_accumulator_with_cpu_prealloc<Torus>(
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,

View File

@@ -2503,6 +2503,41 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_erc20_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
lwe_ciphertext_count: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_erc20_assign_64(
streams: CudaStreamsFFI,
from_amount: *mut CudaRadixCiphertextFFI,
to_amount: *mut CudaRadixCiphertextFFI,
amount: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_erc20(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;

View File

@@ -4,6 +4,7 @@
#include "cuda/include/integer/integer.h"
#include "cuda/include/integer/rerand.h"
#include "cuda/include/aes/aes.h"
#include "cuda/include/erc20/erc20.h"
#include "cuda/include/zk/zk.h"
#include "cuda/include/keyswitch/keyswitch.h"
#include "cuda/include/keyswitch/ks_enums.h"

View File

@@ -115,6 +115,12 @@ path = "benches/integer/bench.rs"
harness = false
required-features = ["integer", "pbs-stats", "internal-keycache"]
[[bench]]
name = "integer-erc20"
path = "benches/integer/erc20.rs"
harness = false
required-features = ["integer", "pbs-stats", "internal-keycache"]
[[bench]]
name = "integer-signed"
path = "benches/integer/signed_bench.rs"

View File

@@ -42,6 +42,19 @@ where
(new_from_amount, new_to_amount)
}
#[cfg(feature = "gpu")]
pub fn transfer_backend<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: FheErc20<Output = FheType>,
for<'a> &'a FheType: FheErc20<Output = FheType>,
{
from_amount.erc20(to_amount, amount)
}
/// Parallel variant of [`transfer_whitepaper`].
pub fn par_transfer_whitepaper<FheType>(
from_amount: &FheType,
@@ -965,6 +978,14 @@ fn main() {
"transfer::no_cmux",
transfer_no_cmux::<FheUint64>,
);
cuda_bench_transfer_throughput(
&mut group,
&cks,
bench_name,
"FheUint64",
"transfer::backend",
transfer_backend::<FheUint64>,
);
cuda_bench_transfer_throughput(
&mut group,
&cks,

View File

@@ -2,14 +2,17 @@
mod aes;
mod aes256;
mod erc20;
mod oprf;
mod rerand;
use benchmark::params::ParamsAndNumBlocksIter;
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, EnvConfig, OperatorType,
gen_random_u256, get_bench_type, throughput_num_threads, write_to_json, BenchmarkType,
EnvConfig, OperatorType,
};
use criterion::{criterion_group, Criterion, Throughput};
use rand::prelude::*;
use rayon::prelude::*;
@@ -26,13 +29,6 @@ use tfhe::{get_pbs_count, reset_pbs_count};
/// It must be as big as the largest bit size tested
type ScalarType = U256;
fn gen_random_u256(rng: &mut ThreadRng) -> U256 {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
tfhe::integer::U256::from((clearlow, clearhigh))
}
/// Base function to bench a server key function that is a binary operation, input ciphertexts will
/// contain non zero carries
fn bench_server_key_binary_function_dirty_inputs<F>(

View File

@@ -0,0 +1,144 @@
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use benchmark::utilities::{
cuda_local_keys, cuda_local_streams, gen_random_u256, get_bench_type, BenchmarkType,
};
use criterion::{Criterion, Throughput};
use rayon::prelude::*;
use rayon::ThreadPoolBuilder;
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
#[cfg(feature = "gpu")]
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
#[cfg(feature = "gpu")]
use tfhe::integer::gpu::CudaServerKey;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::IntegerKeyKind;
use tfhe::keycache::NamedParam;
fn main() {
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
#[cfg(feature = "gpu")]
cuda_erc20(&mut criterion);
Criterion::default().configure_from_args().final_summary();
}
#[cfg(feature = "gpu")]
pub fn cuda_erc20(c: &mut Criterion) {
let bench_name = "integer::cuda::erc20";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let mut rng = rand::thread_rng();
let bench_id;
let param = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let param_name = param.name();
let num_block = 32;
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
bench_id = format!("{bench_name}::{param_name}");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &streams);
let encrypt_values = || {
let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
let ct_2 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
let d_ctxt_0 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_0, &streams);
let d_ctxt_1 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &streams);
let d_ctxt_2 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_2, &streams);
(d_ctxt_0, d_ctxt_1, d_ctxt_2)
};
b.iter_batched(
encrypt_values,
|(ct_0, ct_1, ct_2)| {
gpu_sks.erc20(&ct_0, &ct_1, &ct_2, &streams);
},
criterion::BatchSize::SmallInput,
)
});
}
BenchmarkType::Throughput => {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks_vec = cuda_local_keys(&cks);
let gpu_count = get_number_of_gpus() as usize;
bench_id = format!("{bench_name}::throughput::{param_name}");
let elements = 800;
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams(num_block, elements as usize);
let cts_0 = (0..elements)
.map(|i| {
let ct_0 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&ct_0,
&local_streams[i as usize],
)
})
.collect::<Vec<_>>();
let cts_1 = (0..elements)
.map(|i| {
let ct_1 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&ct_1,
&local_streams[i as usize],
)
})
.collect::<Vec<_>>();
let cts_2 = (0..elements)
.map(|i| {
let ct_2 = cks.encrypt_radix(gen_random_u256(&mut rng), num_block);
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&ct_2,
&local_streams[i as usize],
)
})
.collect::<Vec<_>>();
(cts_0, cts_1, cts_2, local_streams)
};
let pool = ThreadPoolBuilder::new().num_threads(32).build().unwrap();
b.iter_batched(
setup_encrypted_values,
|(cts_0, cts_1, cts_2, local_streams)| {
pool.install(|| {
cts_0
.par_iter()
.zip(cts_1.par_iter())
.zip(cts_2.par_iter())
.zip(local_streams.par_iter())
.enumerate()
.for_each(|(i, (((ct_0, ct_1), ct_2), local_stream))| {
gpu_sks_vec[i % gpu_count].erc20(
ct_0,
ct_1,
ct_2,
local_stream,
);
});
})
},
criterion::BatchSize::SmallInput,
);
});
}
};
bench_group.finish();
}

View File

@@ -1,3 +1,5 @@
use rand::prelude::ThreadRng;
use rand::Rng;
use serde::Serialize;
use std::path::PathBuf;
use std::sync::OnceLock;
@@ -791,3 +793,11 @@ mod cuda_utils {
#[cfg(feature = "gpu")]
pub use cuda_utils::*;
use tfhe::integer::U256;
pub fn gen_random_u256(rng: &mut ThreadRng) -> U256 {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
tfhe::integer::U256::from((clearlow, clearhigh))
}

View File

@@ -17,7 +17,7 @@ use crate::high_level_api::traits::{
RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu, SizeOnGpu, SubSizeOnGpu,
};
use crate::high_level_api::traits::{
DivRem, FheEq, FheMax, FheMin, FheOrd, RotateLeft, RotateLeftAssign, RotateRight,
DivRem, FheEq, FheErc20, FheMax, FheMin, FheOrd, RotateLeft, RotateLeftAssign, RotateRight,
RotateRightAssign,
};
#[cfg(feature = "gpu")]
@@ -3206,3 +3206,68 @@ where
})
}
}
#[cfg(feature = "gpu")]
impl<Id> FheErc20<Self> for FheUint<Id>
where
Id: FheUintId,
{
type Output = Self;
fn erc20(self, to: Self, amount: Self) -> (Self::Output, Self::Output) {
<Self as FheErc20<&Self>>::erc20(self, &to, &amount)
}
}
#[cfg(feature = "gpu")]
impl<Id> FheErc20<&Self> for FheUint<Id>
where
Id: FheUintId,
{
type Output = Self;
fn erc20(self, to: &Self, amount: &Self) -> (Self::Output, Self::Output) {
<&Self as FheErc20<&Self>>::erc20(&self, to, amount)
}
}
#[cfg(feature = "gpu")]
impl<Id> FheErc20<Self> for &FheUint<Id>
where
Id: FheUintId,
{
type Output = FheUint<Id>;
fn erc20(self, to: Self, amount: Self) -> (Self::Output, Self::Output) {
global_state::with_internal_keys(|key| match key {
InternalServerKey::Cpu(_cpu_key) => {
panic!("Erc20 is not supported on CPU");
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(cuda_key) => {
let streams = &cuda_key.streams;
let inner_result = cuda_key.key.key.erc20(
&*self.ciphertext.on_gpu(streams),
&*to.ciphertext.on_gpu(streams),
&*amount.ciphertext.on_gpu(streams),
streams,
);
(
FheUint::<Id>::new(
inner_result.0,
cuda_key.tag.clone(),
ReRandomizationMetadata::default(),
),
FheUint::<Id>::new(
inner_result.1,
cuda_key.tag.clone(),
ReRandomizationMetadata::default(),
),
)
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
panic!("Erc20 is not supported on HPU");
}
})
}
}

View File

@@ -27,7 +27,7 @@ pub use crate::high_level_api::strings::traits::*;
#[cfg(feature = "gpu")]
pub use crate::high_level_api::traits::{
AddSizeOnGpu, BitAndSizeOnGpu, BitNotSizeOnGpu, BitOrSizeOnGpu, BitXorSizeOnGpu,
DivRemSizeOnGpu, DivSizeOnGpu, FheEqSizeOnGpu, FheMaxSizeOnGpu, FheMinSizeOnGpu,
DivRemSizeOnGpu, DivSizeOnGpu, FheEqSizeOnGpu, FheErc20, FheMaxSizeOnGpu, FheMinSizeOnGpu,
FheOrdSizeOnGpu, IfThenElseSizeOnGpu, MulSizeOnGpu, NegSizeOnGpu, RemSizeOnGpu,
RotateLeftSizeOnGpu, RotateRightSizeOnGpu, ShlSizeOnGpu, ShrSizeOnGpu, SizeOnGpu, SubSizeOnGpu,
};

View File

@@ -355,3 +355,9 @@ pub trait FheEqSizeOnGpu<Rhs = Self> {
fn get_eq_size_on_gpu(&self, amount: Rhs) -> u64;
fn get_ne_size_on_gpu(&self, amount: Rhs) -> u64;
}
#[cfg(feature = "gpu")]
pub trait FheErc20<Rhs = Self> {
type Output;
fn erc20(self, to: Rhs, amount: Rhs) -> (Self::Output, Self::Output);
}
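
The trait mirrors the crate's other high-level operator traits. Below is a minimal usage sketch, not part of the diff: it assumes the `gpu` feature and the crate's usual GPU key setup, and the function name and clear values are placeholders.

use tfhe::prelude::*;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint64};

fn transfer_demo() {
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    // A GPU server key selects the CUDA backend; the CPU and HPU paths panic
    // for erc20 (see the FheUint impls earlier in this diff).
    set_server_key(CompressedServerKey::new(&client_key).decompress_to_gpu());

    let from = FheUint64::encrypt(100u64, &client_key);
    let to = FheUint64::encrypt(50u64, &client_key);
    let amount = FheUint64::encrypt(30u64, &client_key);

    // Returns the updated (from, to) pair without mutating the inputs.
    let (new_from, new_to) = (&from).erc20(&to, &amount);

    // With sufficient funds, whitepaper transfer semantics give 70 and 80.
    let new_from_clear: u64 = new_from.decrypt(&client_key);
    let new_to_clear: u64 = new_to.decrypt(&client_key);
    assert_eq!((new_from_clear, new_to_clear), (70, 80));
}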

View File

@@ -10357,3 +10357,130 @@ pub(crate) unsafe fn cuda_backend_cast_to_signed<T: UnsignedInteger, B: Numeric>
update_noise_degree(output, &cuda_ffi_output);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_erc20_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
from_amount: &mut CudaRadixCiphertext,
to_amount: &mut CudaRadixCiphertext,
amount: &CudaRadixCiphertext,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
big_lwe_dimension: LweDimension,
small_lwe_dimension: LweDimension,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
num_blocks: u32,
pbs_type: PBSType,
grouping_factor: LweBskGroupingFactor,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
assert_eq!(
streams.gpu_indexes[0],
from_amount.d_blocks.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first from_amount pointer is on GPU {}",
streams.gpu_indexes[0].get(),
from_amount.d_blocks.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
to_amount.d_blocks.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first to_amount pointer is on GPU {}",
streams.gpu_indexes[0].get(),
to_amount.d_blocks.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
amount.d_blocks.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first amount pointer is on GPU {}",
streams.gpu_indexes[0].get(),
amount.d_blocks.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
bootstrapping_key.gpu_index(0),
"GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
streams.gpu_indexes[0].get(),
bootstrapping_key.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
keyswitch_key.gpu_index(0),
"GPU error: first stream is on GPU {}, first ksk pointer is on GPU {}",
streams.gpu_indexes[0].get(),
keyswitch_key.gpu_index(0).get(),
);
let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let mut from_amount_degrees = from_amount.info.blocks.iter().map(|b| b.degree.0).collect();
let mut from_amount_noise_levels = from_amount
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let mut cuda_ffi_from_amount = prepare_cuda_radix_ffi(
from_amount,
&mut from_amount_degrees,
&mut from_amount_noise_levels,
);
let mut amount_degrees = amount.info.blocks.iter().map(|b| b.degree.0).collect();
let mut amount_noise_levels = amount.info.blocks.iter().map(|b| b.noise_level.0).collect();
let cuda_ffi_amount =
prepare_cuda_radix_ffi(amount, &mut amount_degrees, &mut amount_noise_levels);
let mut to_amount_degrees = to_amount.info.blocks.iter().map(|b| b.degree.0).collect();
let mut to_amount_noise_levels = to_amount
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let mut cuda_ffi_to_amount = prepare_cuda_radix_ffi(
to_amount,
&mut to_amount_degrees,
&mut to_amount_noise_levels,
);
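// The backend follows the usual scratch / run / cleanup pattern: the scratch
// call allocates the temporary GPU buffers behind `mem_ptr`, the assign call
// runs the transfer, and the cleanup call releases the buffers.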
scratch_cuda_erc20_64(
streams.ffi(),
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
big_lwe_dimension.0 as u32,
small_lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
num_blocks,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
true,
noise_reduction_type as u32,
);
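// `from_amount` and `to_amount` are updated in place on the device.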
cuda_erc20_assign_64(
streams.ffi(),
&raw mut cuda_ffi_from_amount,
&raw mut cuda_ffi_to_amount,
&raw const cuda_ffi_amount,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
);
cleanup_cuda_erc20(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
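// Copy the degree / noise-level metadata tracked on the FFI views back into
// the Rust-side ciphertexts.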
update_noise_degree(from_amount, &cuda_ffi_from_amount);
update_noise_degree(to_amount, &cuda_ffi_to_amount);
}

View File

@@ -0,0 +1,175 @@
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{cuda_backend_erc20_assign, PBSType};
impl CudaServerKey {
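/// In-place encrypted ERC20-style transfer, without carry propagation.
///
/// Updates `from_amount` and `to_amount` directly on the GPU. As with the
/// other `unchecked_*` entry points, inputs are assumed to already have
/// empty block carries.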
pub fn unchecked_erc20_assign<T>(
&self,
from_amount: &mut T,
to_amount: &mut T,
amount: &T,
streams: &CudaStreams,
) where
T: CudaIntegerRadixCiphertext,
{
let num_blocks = amount.as_ref().d_blocks.lwe_ciphertext_count().0 as u32;
unsafe {
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_backend_erc20_assign(
streams,
from_amount.as_mut(),
to_amount.as_mut(),
amount.as_ref(),
&d_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_bsk.glwe_dimension,
d_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_bsk.decomp_level_count,
d_bsk.decomp_base_log,
num_blocks,
PBSType::Classical,
LweBskGroupingFactor(0),
d_bsk.ms_noise_reduction_configuration.as_ref(),
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_backend_erc20_assign(
streams,
from_amount.as_mut(),
to_amount.as_mut(),
amount.as_ref(),
&d_multibit_bsk.d_vec,
&self.key_switching_key.d_vec,
self.message_modulus,
self.carry_modulus,
d_multibit_bsk.glwe_dimension,
d_multibit_bsk.polynomial_size,
self.key_switching_key
.input_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key
.output_key_lwe_size()
.to_lwe_dimension(),
self.key_switching_key.decomposition_level_count(),
self.key_switching_key.decomposition_base_log(),
d_multibit_bsk.decomp_level_count,
d_multibit_bsk.decomp_base_log,
num_blocks,
PBSType::MultiBit,
d_multibit_bsk.grouping_factor,
None,
);
}
}
}
}
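/// As [`Self::unchecked_erc20_assign`], but leaves the inputs untouched and
/// returns the updated balances as fresh ciphertexts.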
pub fn unchecked_erc20<T>(
&self,
from_amount: &T,
to_amount: &T,
amount: &T,
streams: &CudaStreams,
) -> (T, T)
where
T: CudaIntegerRadixCiphertext,
{
let mut from_amount = from_amount.duplicate(streams);
let mut to_amount = to_amount.duplicate(streams);
self.unchecked_erc20_assign(&mut from_amount, &mut to_amount, amount, streams);
(from_amount, to_amount)
}
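/// Encrypted ERC20-style transfer.
///
/// Propagates any pending carries in `from_amount` and `to_amount`, then
/// returns the updated `(from, to)` pair as fresh ciphertexts. Note that
/// `amount` is not propagated here and is expected to already have empty
/// block carries.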
pub fn erc20<T>(
&self,
from_amount: &T,
to_amount: &T,
amount: &T,
streams: &CudaStreams,
) -> (T, T)
where
T: CudaIntegerRadixCiphertext,
{
let mut tmp_from_amount;
let mut tmp_to_amount;
let (from_amount, to_amount) = match (
from_amount.block_carries_are_empty(),
to_amount.block_carries_are_empty(),
) {
(true, true) => (from_amount, to_amount),
(true, false) => {
tmp_to_amount = to_amount.duplicate(streams);
self.full_propagate_assign(&mut tmp_to_amount, streams);
(from_amount, &tmp_to_amount)
}
(false, true) => {
tmp_from_amount = from_amount.duplicate(streams);
self.full_propagate_assign(&mut tmp_from_amount, streams);
(&tmp_from_amount, to_amount)
}
(false, false) => {
tmp_to_amount = to_amount.duplicate(streams);
tmp_from_amount = from_amount.duplicate(streams);
self.full_propagate_assign(&mut tmp_from_amount, streams);
self.full_propagate_assign(&mut tmp_to_amount, streams);
(&tmp_from_amount, &tmp_to_amount)
}
};
self.unchecked_erc20(from_amount, to_amount, amount, streams)
}
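/// In-place variant of [`Self::erc20`]: propagates pending carries directly
/// on the caller's ciphertexts, then updates both balances.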
pub fn erc20_assign<T>(
&self,
from_amount: &mut T,
to_amount: &mut T,
amount: &T,
streams: &CudaStreams,
) where
T: CudaIntegerRadixCiphertext,
{
// Both balances are in-out parameters, so carries must be propagated in
// place on the caller's ciphertexts; propagating duplicated copies (as the
// non-assign path does) would silently drop the results.
if !from_amount.block_carries_are_empty() {
self.full_propagate_assign(from_amount, streams);
}
if !to_amount.block_carries_are_empty() {
self.full_propagate_assign(to_amount, streams);
}
self.unchecked_erc20_assign(from_amount, to_amount, amount, streams);
}
}
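
For completeness, a rough sketch of exercising `CudaServerKey::erc20` directly at the integer layer. The setup names (`CudaStreams::new_single_gpu`, `GpuIndex`, `gen_keys_radix_gpu`) are taken from the existing GPU integer API as assumptions, not from this diff, and the parameter constant is a placeholder; use whichever shortint parameter set your tfhe-rs version exposes.

use tfhe::core_crypto::gpu::vec::GpuIndex;
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use tfhe::integer::gpu::gen_keys_radix_gpu;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; // placeholder parameter set

fn erc20_gpu_sketch() {
    let streams = CudaStreams::new_single_gpu(GpuIndex::new(0));
    // 32 radix blocks of 2-bit messages give 64-bit balances.
    let (cks, sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, 32, &streams);

    let d_from =
        CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cks.encrypt(100u64), &streams);
    let d_to = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cks.encrypt(50u64), &streams);
    let d_amount =
        CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cks.encrypt(30u64), &streams);

    // Checked entry point: propagates carries in the balances, then transfers.
    let (d_new_from, d_new_to) = sks.erc20(&d_from, &d_to, &d_amount, &streams);

    let new_from: u64 = cks.decrypt(&d_new_from.to_radix_ciphertext(&streams));
    let new_to: u64 = cks.decrypt(&d_new_to.to_radix_ciphertext(&streams));
    assert_eq!((new_from, new_to), (70, 80));
}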

View File

@@ -37,6 +37,7 @@ mod bitwise_op;
mod cmux;
mod comparison;
mod div_mod;
mod erc20;
mod even_odd;
mod ilog2;
mod mul;