chore(gpu): some tests on erc20

refactor(gpu): creating benchmarks for match_value
2026-04-28 03:01:21 -04:00 · 2025-12-11 14:09:32 +01:00 · 2025-12-11 10:31:44 +01:00
333 changed files with 4647 additions and 15002 deletions
--- a/.cargo/audit.toml
+++ b/.cargo/audit.toml
@@ -2,8 +2,6 @@
 ignore = [
    # Ignoring unmaintained 'paste' advisory as it is a widely used, low-risk build dependency.
    "RUSTSEC-2024-0436",
-    # Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex on the short term.
-    "RUSTSEC-2025-0141",
 ]

 [output]
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -23,8 +23,6 @@ runs:
        echo "${CMAKE_SCRIPT_SHA} cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
        sha256sum -c checksum
        sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
-        sudo apt-get clean
-        sudo rm -rf /var/lib/apt/lists/*
        sudo apt update
        sudo apt remove -y unattended-upgrades
        sudo apt install -y cmake-format libclang-dev
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -66,7 +66,7 @@ jobs:
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'true' # Needed to pull lfs data
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -80,7 +80,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -109,7 +109,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -63,7 +63,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -71,7 +71,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            dependencies:
@@ -171,7 +171,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -219,7 +219,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
        with:
          path: |
            ~/.nvm
@@ -232,7 +232,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -50,7 +50,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -58,7 +58,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            integer:
@@ -112,7 +112,7 @@ jobs:
    timeout-minutes: 480 # 8 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -60,7 +60,7 @@ jobs:
    timeout-minutes: 1440
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -51,7 +51,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -59,7 +59,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            integer:
@@ -112,7 +112,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -72,7 +72,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -80,7 +80,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            dependencies:
@@ -182,7 +182,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -64,7 +64,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -80,7 +80,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
        with:
          path: |
            ~/.nvm
@@ -93,7 +93,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -1,8 +1,6 @@
 # Run benchmarks on an AWS instance and return parsed results to Slab CI bot.
 name: benchmark_cpu

-run-name: ${{ inputs.command }}::${{ inputs.bench_type}} (${{ inputs.op_flavor }}, ${{ inputs.precisions_set }}, ${{ inputs.params_type }})
-
 on:
  workflow_dispatch:
    inputs:
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -149,7 +149,7 @@ jobs:
        params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -223,13 +223,13 @@ jobs:
          results_type: ${{ inputs.additional_results_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -49,7 +49,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -99,13 +99,13 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ github.sha }}_ct_key_sizes
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -8,13 +8,8 @@ on:
        description: "Run CPU benchmarks"
        type: boolean
        default: true
-      # GPU benchmarks are split because of resource scarcity.
-      run-gpu-integer-benchmarks:
-        description: "Run GPU integer benchmarks"
-        type: boolean
-        default: true
-      run-gpu-core-crypto-benchmarks:
-        description: "Run GPU core-crypto benchmarks"
+      run-gpu-benchmarks:
+        description: "Run GPU benchmarks"
        type: boolean
        default: true
      run-hpu-benchmarks:
@@ -57,7 +52,7 @@ jobs:
  run-benchmarks-gpu-integer:
    name: benchmark_documentation/run-benchmarks-gpu-integer
    uses: ./.github/workflows/benchmark_gpu_common.yml
-    if: inputs.run-gpu-integer-benchmarks
+    if: inputs.run-gpu-benchmarks
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
@@ -118,7 +113,7 @@ jobs:
  run-benchmarks-gpu-core-crypto:
    name: benchmark_documentation/run-benchmarks-gpu-core-crypto
    uses: ./.github/workflows/benchmark_gpu_common.yml
-    if: inputs.run-gpu-core-crypto-benchmarks
+    if: inputs.run-gpu-benchmarks
    with:
      profile: multi-h100-sxm5
      hardware_name: n3-H100-SXM5x8
@@ -138,7 +133,7 @@ jobs:
  generate-svgs-with-benchmarks-run:
    name: benchmark-documentation/generate-svgs-with-benchmarks-run
    if: ${{ always() &&
-      (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
+      (inputs.run-cpu-benchmarks || inputs.run-gpu-benchmarks ||inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    needs: [
      run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
@@ -148,7 +143,7 @@ jobs:
    with:
      time_span_days: 5
      generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
-      generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
+      generate-gpu-svgs: ${{ inputs.run-gpu-benchmarks }}
      generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
    secrets:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -157,7 +152,7 @@ jobs:

  generate-svgs-without-benchmarks-run:
    name: benchmark-documentation/generate-svgs-without-benchmarks-run
-    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
+    if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-benchmarks || inputs.run-hpu-benchmarks) &&
      inputs.generate-svgs }}
    uses: ./.github/workflows/generate_svgs.yml
    with:
@@ -180,12 +175,12 @@ jobs:
      PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'

      - name: Download SVG tables
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
        with:
          path: svg_tables
          merge-multiple: 'true'
@@ -203,7 +198,7 @@ jobs:
          echo "date=$(date '+%g_%m_%d_%Hh%Mm%Ss')" >> "${GITHUB_OUTPUT}"

      - name: Create pull-request
-        uses: peter-evans/create-pull-request@98357b18bf14b5342f975ff684046ec3b2a07725 # v8.0.0
+        uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
        with:
          sign-commits: true # Commit will be signed by github-actions bot
          add-paths: ${{ env.PATH_TO_DOC_ASSETS }}/*.svg
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -1,8 +1,6 @@
 # Run CUDA benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
 name: benchmark_gpu

-run-name: ${{ inputs.command }}::${{ inputs.bench_type}} (${{ inputs.profile }}, ${{ inputs.op_flavor }}, ${{ inputs.precisions_set }}, ${{ inputs.params_type }})
-
 on:
  workflow_dispatch:
    inputs:
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -40,7 +40,7 @@ jobs:
    timeout-minutes: 1440 # 24 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
@@ -89,7 +89,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ github.sha }}_integer_multi_bit_gpu_default
          path: ${{ env.RESULTS_FILENAME }}
@@ -123,7 +123,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
@@ -173,7 +173,7 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -175,7 +175,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -281,13 +281,13 @@ jobs:
          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -130,7 +130,7 @@ jobs:
          git lfs install

      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        with:
          path: tfhe-rs
          persist-credentials: false
@@ -141,7 +141,7 @@ jobs:
          ls

      - name: Checkout fhevm
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        with:
          repository: zama-ai/fhevm
          persist-credentials: 'false'
@@ -192,10 +192,10 @@ jobs:
          cargo install sqlx-cli

      - name: Install foundry
-        uses: foundry-rs/foundry-toolchain@8b0419c685ef46cb79ec93fbdc131174afceb730
+        uses: foundry-rs/foundry-toolchain@50d5a8956f2e319df19e6b57539d7e2acb9f8c1e

      - name: Cache cargo
-        uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
+        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
        with:
          path: |
            ~/.cargo/registry
@@ -223,7 +223,7 @@ jobs:
        working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker

      - name: Use Node.js
-        uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
+        uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
        with:
          node-version: 20.x

@@ -262,7 +262,7 @@ jobs:
      - name: Upload profile artifact
        env:
          REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ env.REPORT_NAME }}
          path: fhevm/coprocessor/fhevm-engine/tfhe-worker/${{ env.REPORT_NAME }}
@@ -293,13 +293,13 @@ jobs:
        working-directory: fhevm/

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${COMMIT_SHA}_${BENCHMARKS}_${{ needs.parse-inputs.outputs.profile }}
          path: fhevm/$${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -1,8 +1,6 @@
 # Run benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
 name: benchmark_hpu

-run-name: ${{ inputs.command }}::${{ inputs.bench_type}} (${{ inputs.op_flavor }}, ${{ inputs.precisions_set }})
-
 on:
  workflow_dispatch:
    inputs:
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -126,7 +126,7 @@ jobs:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -185,13 +185,13 @@ jobs:
          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
-          name: ${{ github.sha }}_${{ matrix.bench_type }}_${{ matrix.command }}_benchmarks
+          name: ${{ github.sha }}_${{ matrix.bench_type }}_integer_benchmarks
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -50,7 +50,7 @@ jobs:
      pull-requests: write # Needed to write a comment in a pull-request
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
        command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0  # Needed to get commit hash
          persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
@@ -280,7 +280,7 @@ jobs:
          BENCH_TYPE: ${{ env.__TFHE_RS_BENCH_TYPE }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ github.sha }}_regression_${{ env.RESULTS_FILE_SHA }} # RESULT_FILE_SHA is needed to avoid collision between matrix.command runs
          path: ${{ env.RESULTS_FILENAME }}
@@ -305,7 +305,7 @@ jobs:
      REF_NAME: ${{ github.head_ref || github.ref_name }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -55,7 +55,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -96,13 +96,13 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ github.sha }}_fft
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -55,7 +55,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -96,13 +96,13 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ github.sha }}_ntt
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -39,7 +39,7 @@ jobs:
      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            wasm_bench:
@@ -91,7 +91,7 @@ jobs:
        browser: [ chrome, firefox ]
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -119,7 +119,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
        with:
          path: |
            ~/.nvm
@@ -132,7 +132,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
+        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -153,12 +153,6 @@ jobs:
        env:
          BROWSER: ${{ matrix.browser }}

-      - name: Run benchmarks (unsafe coop)
-        run: |
-          make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
-        env:
-          BROWSER: ${{ matrix.browser }}
-
      - name: Parse results
        run: |
          make parse_wasm_benchmarks
@@ -175,13 +169,13 @@ jobs:
          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/cargo_audit.yml
+++ b/.github/workflows/cargo_audit.yml
@@ -26,7 +26,7 @@ jobs:
    name: cargo_audit/audit
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -24,7 +24,7 @@ jobs:
    outputs:
      matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -140,7 +140,7 @@ jobs:
      result: ${{ steps.set_builds_result.outputs.result }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_tfhe_fft.yml
+++ b/.github/workflows/cargo_build_tfhe_fft.yml
@@ -26,7 +26,7 @@ jobs:
      fail-fast: false

    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -24,7 +24,7 @@ jobs:
        os: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -29,7 +29,7 @@ jobs:
      fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -37,7 +37,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            fft:
@@ -56,7 +56,7 @@ jobs:
        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -92,7 +92,7 @@ jobs:
    if: needs.should-run.outputs.fft_test == 'true'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -31,7 +31,7 @@ jobs:
      ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -39,7 +39,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            ntt:
@@ -87,7 +87,7 @@ jobs:
        os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -20,7 +20,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
          version: ${{ steps.get_zizmor.outputs.version }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@6124774845927d14c601359ab8138699fa5b70c3 # v4.0.1
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@9e9574ef04ea69da568d6249bd69539ccc704e74 # v4.0.0
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -50,7 +50,7 @@ jobs:
    timeout-minutes: 5760 # 4 days
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -62,7 +62,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            tfhe:
@@ -92,7 +92,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@5a1091511ad55cbe89839c7260b706298ca349f7
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -106,7 +106,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
+        uses: codecov/codecov-action@5a1091511ad55cbe89839c7260b706298ca349f7
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -62,7 +62,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/generate_svg_common.yml
+++ b/.github/workflows/generate_svg_common.yml
@@ -43,7 +43,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'

@@ -75,22 +75,13 @@ jobs:
          DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
          DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-      - name: Upload tables
-        if: inputs.backend_comparison == false
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
-        with:
-          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
-          # This will upload all the file generated
-          path: ${{ inputs.output_filename }}*.svg
-          retention-days: 60
-
      - name: Produce backends comparison table from database
        if: inputs.backend_comparison == true
        run: |
          python3 -m pip install -r ci/data_extractor/requirements.txt
          python3 ci/data_extractor/src/data_extractor.py "${OUTPUT_FILENAME}" \
          --generate-svg \
-          --backends-comparison \
+          --backend-comparison \
          --time-span-days "${TIME_SPAN}"
        env:
          OUTPUT_FILENAME: ${{ inputs.output_filename }}
@@ -99,11 +90,10 @@ jobs:
          DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
          DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}

-      - name: Upload comparison tables
-        if: inputs.backend_comparison == true
-        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+      - name: Upload tables
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
        with:
-          name: ${{ github.sha }}_backends_comparison_tables
+          name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
          # This will upload all the file generated
          path: ${{ inputs.output_filename }}*.svg
          retention-days: 60
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -41,7 +41,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -23,8 +23,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # every month
-    - cron: "0 0 1 * *"
+    # every 3 months
+    - cron: "0 0 1 */3 *"

 permissions:
  contents: read
@@ -50,7 +50,7 @@ jobs:
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: hyperstack
-          profile: single-h100
+          profile: gpu-test

      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
@@ -79,7 +79,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            gpu:
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -39,7 +39,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            gpu:
@@ -114,7 +114,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -68,7 +68,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            gpu:
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -65,7 +65,7 @@ jobs:
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -78,7 +78,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -78,7 +78,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -74,7 +74,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            gpu:
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            gpu:
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -49,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            gpu:
@@ -117,7 +117,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            gpu:
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            gpu:
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -49,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            gpu:
@@ -117,7 +117,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -32,7 +32,7 @@ jobs:
      hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -40,7 +40,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
        with:
          files_yaml: |
            hpu:
@@ -83,7 +83,7 @@ jobs:
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -53,7 +53,7 @@ jobs:
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -41,7 +41,7 @@ jobs:
    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -52,7 +52,7 @@ jobs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -62,7 +62,7 @@ jobs:
          PACKAGE: ${{ inputs.package-name }}
        run: |
          cargo package -p "${PACKAGE}"
-      - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+      - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: crate-${{ inputs.package-name }}
          path: target/package/*.crate
@@ -93,14 +93,14 @@ jobs:
      id-token: write # Needed for OIDC token exchange on crates.io
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        with:
          fetch-depth: 0
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Download artifact
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
        with:
          name: crate-${{ inputs.package-name }}
          path: target/package
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -64,7 +64,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -104,7 +104,7 @@ jobs:
        run: |
          cargo package -p tfhe-cuda-backend

-      - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+      - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: crate-tfhe-cuda-backend
          path: target/package/*.crate
@@ -174,7 +174,7 @@ jobs:
          GCC_VERSION: ${{ matrix.gcc }}

      - name: Download artifact
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
        with:
          name: crate-tfhe-cuda-backend
          path: target/package
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -41,7 +41,6 @@ jobs:
  make-release:
    name: make_release_tfhe/make-release
    uses: ./.github/workflows/make_release_common.yml
-    if: ${{ inputs.push_to_crates }}
    with:
      package-name: "tfhe"
      dry-run: ${{ inputs.dry_run }}
@@ -60,7 +59,6 @@ jobs:
  make-release-js:
    name: make_release_tfhe/make-release-js
    needs: make-release
-    if: ${{ always() && needs.make-release.result != 'failure' }}
    runs-on: ubuntu-latest
    # For provenance of npmjs publish
    permissions:
@@ -68,7 +66,7 @@ jobs:
      id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -85,9 +83,9 @@ jobs:
          make build_web_js_api_parallel

      - name: Authenticate on NPM
-        uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
+        uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
        with:
-          node-version: '24'
+          node-version: '22'
          registry-url: 'https://registry.npmjs.org'

      - name: Publish web package
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -60,7 +60,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
          toolchain: stable

      - name: Checkout lattice-estimator
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
--- a/.github/workflows/pr_milestone_check.yml
+++ b/.github/workflows/pr_milestone_check.yml
@@ -1,67 +0,0 @@
-name: pr_milestone_check
-
-on:
-  pull_request:
-    types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
-# external contributors workflows are manually approved
-
-jobs:
-  check-empty-milestone:
-    name: pr_milestone_check/check-empty-milestone
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone == null
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone Missing
-
-            Please assign a milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is missing. This check is failing."
-          exit 1
-
-  check-milestone-open:
-    name: pr_milestone_check/check-milestone-open
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone is closed
-
-            Please assign an open milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is closed. This check is failing."
-          exit 1
--- a/.github/workflows/unverified_prs.yml
+++ b/.github/workflows/unverified_prs.yml
@@ -17,7 +17,7 @@ jobs:
      issues: read # Needed to fetch all issues
      pull-requests: write # Needed to write message and close the PR
    steps:
-      - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
        with:
          stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
          days-before-stale: 2
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,6 @@ target/
 **/*.rmeta
 **/Cargo.lock
 **/*.bin
-**/.DS_Store

 # Some of our bench outputs
 /tfhe/benchmarks_parameters
--- a/6
+++ b/6
@@ -11,7 +11,7 @@
 /tfhe/src/core_crypto/gpu               @agnesLeroy
 /tfhe/src/core_crypto/hpu               @zama-ai/hardware

-/tfhe/src/shortint/                     @mayeul-zama @nsarlin-zama
+/tfhe/src/shortint/                     @mayeul-zama

 /tfhe/src/integer/                      @tmontaigu
 /tfhe/src/integer/gpu                   @agnesLeroy
@@ -19,12 +19,8 @@

 /tfhe/src/high_level_api/               @tmontaigu

-/tfhe-zk-pok/                           @nsarlin-zama
-
 /tfhe-benchmark/                        @soonum

-/utils/                                 @nsarlin-zama
-
 /Makefile                               @IceTDrinker @soonum

 /mockups/tfhe-hpu-mockup                @zama-ai/hardware
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,7 +27,7 @@ rust-version = "1.91.1"
 [workspace.dependencies]
 aligned-vec = { version = "0.6", default-features = false }
 bytemuck = "1.24"
-dyn-stack = { version = "0.13", default-features = false }
+dyn-stack = { version = "0.11", default-features = false }
 itertools = "0.14"
 num-complex = "0.4"
 pulp = { version = "0.22", default-features = false }
@@ -36,8 +36,6 @@ rayon = "1.11"
 serde = { version = "1.0", default-features = false }
 wasm-bindgen = "0.2.101"
 getrandom = "0.2.8"
-# The project maintainers consider that this is the last version of the 1.3 branch, any newer version should not be trusted
-bincode = "=1.3.3"

 [profile.bench]
 lto = "fat"
--- a/57
+++ b/57
@@ -20,7 +20,7 @@ BENCH_TYPE?=latency
 BENCH_PARAM_TYPE?=classical
 BENCH_PARAMS_SET?=default
 BENCH_CUSTOM_COMMAND:=
-NODE_VERSION=24.12
+NODE_VERSION=22.6
 BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
 BACKWARD_COMPAT_DATA_GEN_VERSION:=$(TFHE_VERSION)
 TEST_VECTORS_DIR=apps/test-vectors
@@ -996,15 +996,6 @@ test_noise_check:
 		--features=boolean,shortint,integer -p tfhe -- noise_check \
 		--test-threads=1 --nocapture

-.PHONY: test_noise_check_gpu # Run dedicated noise and pfail check tests on gpu backend
-test_noise_check_gpu:
-	@# First run the sanity checks to make sure the atomic patterns are correct
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		--features=boolean,shortint,integer,gpu -p tfhe -- gpu_sanity_check
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		--features=boolean,shortint,integer,gpu -p tfhe -- gpu_noise_check \
-		--test-threads=1 --nocapture
-
 .PHONY: test_safe_serialization # Run the tests for safe serialization
 test_safe_serialization: install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
@@ -1300,14 +1291,13 @@ run_web_js_api_parallel: build_web_js_api_parallel setup_venv
 	--browser-path $(browser_path) \
 	--driver-path $(driver_path) \
 	--browser-kind  $(browser_kind) \
-	--server-cmd $(server_cmd) \
+	--server-cmd "npm run server" \
 	--server-workdir "$(WEB_SERVER_DIR)" \
 	--id-pattern $(filter)

 test_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
 test_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
 test_web_js_api_parallel_chrome: browser_kind = chrome
-test_web_js_api_parallel_chrome: server_cmd = "npm run server:multithreaded"
 test_web_js_api_parallel_chrome: filter = Test

 .PHONY: test_web_js_api_parallel_chrome # Run tests for the web wasm api on Chrome
@@ -1323,7 +1313,6 @@ test_web_js_api_parallel_chrome_ci: setup_venv
 test_web_js_api_parallel_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
 test_web_js_api_parallel_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
 test_web_js_api_parallel_firefox: browser_kind = firefox
-test_web_js_api_parallel_firefox: server_cmd = "npm run server:multithreaded"
 test_web_js_api_parallel_firefox: filter = Test

 .PHONY: test_web_js_api_parallel_firefox # Run tests for the web wasm api on Firefox
@@ -1354,6 +1343,7 @@ dieharder_csprng: install_dieharder build_tfhe_csprng

 .PHONY: clippy_bench # Run clippy lints on tfhe-benchmark
 clippy_bench: install_rs_check_toolchain
+	! (grep --recursive "trivial" tfhe-benchmark && echo "trivial found in benches")
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
 		-p tfhe-benchmark -- --no-deps -D warnings
@@ -1454,13 +1444,6 @@ bench_integer_aes256_gpu: install_rs_check_toolchain
 	--bench integer-aes256 \
 	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
-bench_integer_trivium_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-trivium \
-	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
-
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1580,7 +1563,6 @@ bench_pbs128_gpu: install_rs_check_toolchain
 bench_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
 bench_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
 bench_web_js_api_parallel_chrome: browser_kind = chrome
-bench_web_js_api_parallel_chrome: server_cmd = "npm run server:multithreaded"
 bench_web_js_api_parallel_chrome: filter = Bench

 .PHONY: bench_web_js_api_parallel_chrome # Run benchmarks for the web wasm api
@@ -1596,7 +1578,6 @@ bench_web_js_api_parallel_chrome_ci: setup_venv
 bench_web_js_api_parallel_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
 bench_web_js_api_parallel_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
 bench_web_js_api_parallel_firefox: browser_kind = firefox
-bench_web_js_api_parallel_firefox: server_cmd = "npm run server:multithreaded"
 bench_web_js_api_parallel_firefox: filter = Bench

 .PHONY: bench_web_js_api_parallel_firefox # Run benchmarks for the web wasm api
@@ -1609,38 +1590,6 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) bench_web_js_api_parallel_firefox

-bench_web_js_api_unsafe_coop_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
-bench_web_js_api_unsafe_coop_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
-bench_web_js_api_unsafe_coop_chrome: browser_kind = chrome
-bench_web_js_api_unsafe_coop_chrome: server_cmd = "npm run server:unsafe-coop"
-bench_web_js_api_unsafe_coop_chrome: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
-
-.PHONY: bench_web_js_api_unsafe_coop_chrome # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_chrome: run_web_js_api_parallel
-
-.PHONY: bench_web_js_api_unsafe_coop_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_chrome_ci: setup_venv
-	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_unsafe_coop_chrome
-
-bench_web_js_api_unsafe_coop_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
-bench_web_js_api_unsafe_coop_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
-bench_web_js_api_unsafe_coop_firefox: browser_kind = firefox
-bench_web_js_api_unsafe_coop_firefox: server_cmd = "npm run server:unsafe-coop"
-bench_web_js_api_unsafe_coop_firefox: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
-
-.PHONY: bench_web_js_api_unsafe_coop_firefox # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_firefox: run_web_js_api_parallel
-
-.PHONY: bench_web_js_api_unsafe_coop_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
-bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
-	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
-	$(MAKE) bench_web_js_api_unsafe_coop_firefox
-
 .PHONY: bench_hlapi # Run benchmarks for integer operations
 bench_hlapi: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
 ```

 > [!Note]
-> Note: You need Rust version 1.91.1 or newer to compile TFHE-rs. You can check your version with `rustc --version`.
+> Note: You need Rust version 1.84 or newer to compile TFHE-rs. You can check your version with `rustc --version`.

 > [!Note]
 > Note: AArch64-based machines are not supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
--- a/apps/test-vectors/checksums.sha256
+++ b/apps/test-vectors/checksums.sha256
@@ -1,32 +1,24 @@
-08f31a47c29cc4d72ad32c0b5411fa20b3deef5b84558dd2fb892d3cdf90528a  data/toy_params/glwe_after_id_br_karatsuba.cbor
 29b6e3e7d27700004b70dca24d225816500490e2d6ee49b9af05837fd421896b  data/valid_params_128/lwe_after_spec_pbs.cbor
 2c70d1d78cc3760733850a353ace2b9c4705e840141b75841739e90e51247e18  data/valid_params_128/small_lwe_secret_key.cbor
-2fb4bb45c259b8383da10fc8f9459c40a6972c49b1696eb107f0a75640724be5  data/toy_params/lwe_after_id_pbs_karatsuba.cbor
 36c9080b636475fcacca503ce041bbfeee800fd3e1890dee559ea18defff9fe8  data/toy_params/glwe_after_id_br.cbor
 377761beeb4216cf5aa2624a8b64b8259f5a75c32d28e850be8bced3a0cdd6f5  data/toy_params/ksk.cbor
 59dba26d457f96478eda130cab5301fce86f23c6a8807de42f2a1e78c4985ca7  data/valid_params_128/lwe_ks.cbor
-5d80dd93fefae4f4f89484dfcd65bbe99cc32e7e3b0a90c33dd0d77516c0a023  data/valid_params_128/glwe_after_id_br_karatsuba.cbor
 656f0009c7834c5bcb61621e222047516054b9bc5d0593d474ab8f1c086b67a6  data/valid_params_128/lwe_after_id_pbs.cbor
 699580ca92b9c2f9e1f57fb1e312c9e8cb29714f7acdef9d2ba05f798546751f  data/toy_params/lwe_sum.cbor
 6e54ab41056984595b077baff70236d934308cf5c0c33b4482fbfb129b3756c6  data/valid_params_128/glwe_after_id_br.cbor
 70f5e5728822de05b49071efb5ec28551b0f5cc87aa709a455d8e7f04b9c96ee  data/toy_params/lwe_after_id_pbs.cbor
-76a5c52cab7fec1dc167da676c6cd39479cda6b2bb9f4e0573cb7d99c2692faa  data/valid_params_128/lwe_after_id_pbs_karatsuba.cbor
 7cc6803f5fbc3d5a1bf597f2b979ce17eecd3d6baca12183dea21022a7b65c52  data/toy_params/bsk.cbor
 7f3c40a134623b44779a556212477fea26eaed22450f3b6faeb8721d63699972  data/valid_params_128/lwe_sum.cbor
 837b3bd3245d4d0534ed255fdef896fb4fa6998a258a14543dfdadd0bfc9b6dd  data/toy_params/lwe_prod.cbor
-9ece8ca9c1436258b94e8c5e629b8722f9b18fdd415dd5209b6167a9dde8491c  data/toy_params/glwe_after_spec_br_karatsuba.cbor
 aa44aea29efd6d9e4d35a21a625d9cba155672e3f7ed3eddee1e211e62ad146b  data/valid_params_128/lwe_ms.cbor
 b7a037b9eaa88d6385167579b93e26a0cb6976d9b8967416fd1173e113bda199  data/valid_params_128/large_lwe_secret_key.cbor
-b7b8e3586128887bd682120f3e3a43156139bce5e3fe0b03284f8753a864d647  data/toy_params/lwe_after_spec_pbs_karatsuba.cbor
 bd00a8ae7494e400de5753029552ee1647efe7e17409b863a26a13b081099b8c  data/toy_params/lwe_after_spec_pbs.cbor
 c6df98676de04fe54b5ffc2eb30a82ebb706c9d7d5a4e0ed509700fec88761f7  data/toy_params/lwe_ms.cbor
 c7d5a864d5616a7d8ad50bbf40416e41e6c9b60c546dc14d4aa8fc40a418baa7  data/toy_params/large_lwe_secret_key.cbor
 c806533b325b1009db38be2f9bef5f3b2fad6b77b4c71f2855ccc9d3b4162e98  data/valid_params_128/lwe_b.cbor
 c9eb75bd2993639348a679cf48c06e3c38d1a513f48e5b0ce0047cea8cff6bbc  data/toy_params/lwe_a.cbor
-d3391969acf26dc69de0927ba279139d8d79999944069addc8ff469ad6c5ae2d  data/valid_params_128/lwe_after_spec_pbs_karatsuba.cbor
 d6da5baef0e787f6be56e218d8354e26904652602db964844156fdff08350ce6  data/toy_params/lwe_ks.cbor
 e591ab9af1b6a0aede273f9a3abb65a4c387feb5fa06a6959e9314058ca0f7e5  data/valid_params_128/ksk.cbor
-e59b002df3a9b01ad321ec51cf076fa35131ab9dbef141d1c54b717d61426c92  data/valid_params_128/glwe_after_spec_br_karatsuba.cbor
 e628354c81508a2d888016e8282df363dd12f1e19190b6475d4eb9d7ab8ae007  data/valid_params_128/glwe_after_spec_br.cbor
 e69d2d2c064fc8c0460b39191ca65338146990349954f5ec5ebd01d93610e7eb  data/valid_params_128/lwe_a.cbor
 e76c24b2a0c9a842ad13dda35473c2514f9e7d20983b5ea0759c4521a91626d9  data/valid_params_128/lwe_prod.cbor
--- a/apps/test-vectors/data/README.md
+++ b/apps/test-vectors/data/README.md
@@ -1,46 +1,43 @@
 # Test vectors for TFHE
 These test vectors are generated using [TFHE-rs](https://github.com/zama-ai/tfhe-rs), with the git tag `tfhe-test-vectors-0.2.0`.

-They are TFHE-rs objects serialized in the [cbor format](https://cbor.io/). These can be deserialized using any cbor library for any programming languages. For example, using the [cbor2](https://pypi.org/project/cbor2/) program, the command to run is: `cbor2 --pretty toy_params/lwe_a.cbor`.
+They are TFHE-rs objects serialized in the [cbor format](https://cbor.io/). You can deserialize them using any cbor library for the language of your choice. For example, using the [cbor2](https://pypi.org/project/cbor2/) program, run: `cbor2 --pretty toy_params/lwe_a.cbor`.

-There are 2 folders with test vectors for different parameter sets:
- `valid_params_128`: valid classical PBS parameters using a Gaussian noise distribution, providing 128-bits of security in the IND-CPA model (i.e., the probability of failure is smaller than 2^{-64}).
- `toy_params`: insecure parameters that yield smaller values to simplify the bit comparison of the results.
+You will find 2 folders with test vectors for different parameter sets:
+- `valid_params_128`: valid classical PBS parameters using a Gaussian noise distribution, providing 128 bits of security in the IND-CPA model and a bootstrapping probability of failure of 2^{-64}.
+- `toy_params`: insecure parameters that yield smaller values

-The values are generated to compute a keyswitch (KS) followed by a bootstrap (PBS). The cleartext inputs are 2 values, A and B defined below.
+The values are generated for the keyswitch -> bootstrap (KS-PBS) atomic pattern. The cleartext inputs are 2 values, A and B defined below.

 All the random values are generated from a fixed seed, that can be found in the `RAND_SEED` constant below. The PRNG used is the one based on the AES block cipher in counter mode, from tfhe `tfhe-csprng` crate.

-The bootstrap is applied twice, with 2 different lut, the identity lut and a specific one computing the double of the input value (i.e., f(x) = 2*x).
+The programmable bootstrap is applied twice, with 2 different LUTs: the identity LUT and a specific one (currently a x2 operation, i.e., f(x) = 2*x).

 ## Vectors
 The following values are generated:

 ### Keys
-| name                   | description                                                                             | TFHE-rs type                |
-|------------------------|-----------------------------------------------------------------------------------------|-----------------------------|
-| `large_lwe_secret_key` | Encryption secret key, before the KS and after the PBS                                  | `LweSecretKey<Vec<u64>>`    |
-| `small_lwe_secret_key` | Secret key encrypting ciphertexts between the KS and the PBS                            | `LweSecretKey<Vec<u64>>`    |
-| `ksk`                  | The keyswitching key to convert a ct from the large key to the small one                | `LweKeyswitchKey<Vec<u64>>` |
+| name                   | description                                                                           | TFHE-rs type                |
+|------------------------|---------------------------------------------------------------------------------------|-----------------------------|
+| `large_lwe_secret_key` | Encryption secret key, before the KS and after the PBS                                | `LweSecretKey<Vec<u64>>`    |
+| `small_lwe_secret_key` | Secret key encrypting ciphertexts between the KS and the PBS                          | `LweSecretKey<Vec<u64>>`    |
+| `ksk`                  | The keyswitching key to convert a ct from the large key to the small one              | `LweKeyswitchKey<Vec<u64>>` |
 | `bsk`                  | the bootstrapping key to perform a programmable bootstrap on the keyswitched ciphertext | `LweBootstrapKey<Vec<u64>>` |


 ### Ciphertexts
-| name                 | description                                                                                         | TFHE-rs type               | Cleartext            |
-|----------------------|-----------------------------------------------------------------------------------------------------|----------------------------|----------------------|
-| `lwe_a`              | LWE Ciphertext encrypting A                                                                         | `LweCiphertext<Vec<u64>>`  | `A`                  |
-| `lwe_b`              | LWE Ciphertext encrypting B                                                                         | `LweCiphertext<Vec<u64>>`  | `B`                  |
-| `lwe_sum`            | LWE Ciphertext encrypting A plus lwe encryption of B                                                | `LweCiphertext<Vec<u64>>`  | `A+B`                |
-| `lwe_prod`           | LWE Ciphertext encrypting A times cleartext B                                                       | `LweCiphertext<Vec<u64>>`  | `A*B`                |
-| `lwe_ms`             | LWE Ciphertext encrypting A after a Modulus Switch from q to 2*N ([note](#non-native-encoding))     | `LweCiphertext<Vec<u64>>`  | `A`                  |
-| `lwe_ks`             | LWE Ciphertext encrypting A after a keyswitch from `large_lwe_secret_key` to `small_lwe_secret_key` | `LweCiphertext<Vec<u64>>`  | `A`                  |
-| `glwe_after_id_br`   | GLWE Ciphertext encrypting A after the application of the identity blind rotation on `lwe_ms`       | `GlweCiphertext<Vec<u64>>` | rotation of id LUT   |
-| `lwe_after_id_pbs`   | LWE Ciphertext encrypting A after the sample extract operation on `glwe_after_id_br`                | `LweCiphertext<Vec<u64>>`  | `A`                  |
-| `glwe_after_spec_br` | GLWE Ciphertext encrypting spec(A) after the application of the spec blind rotation on `lwe_ms`     | `GlweCiphertext<Vec<u64>>` | rotation of spec LUT |
-| `lwe_after_spec_pbs` | LWE Ciphertext encrypting spec(A) after the sample extract operation on `glwe_after_spec_br`        | `LweCiphertext<Vec<u64>>`  | `spec(A)`            |
-
-Ciphertexts with the `_karatsuba` suffix are generated using the Karatsuba polynomial multiplication algorithm in the blind rotation, while default ciphertexts are generated using an FFT multiplication.
-This makes it easier to reproduce bit exact results.
+| name                 | description                                                                                                  | TFHE-rs type               | Cleartext    |
+|----------------------|--------------------------------------------------------------------------------------------------------------|----------------------------|--------------|
+| `lwe_a`              | LWE encryption of A                                                                                          | `LweCiphertext<Vec<u64>>`  | `A`          |
+| `lwe_b`              | LWE encryption of B                                                                                          | `LweCiphertext<Vec<u64>>`  | `B`          |
+| `lwe_sum`            | LWE encryption of A plus LWE encryption of B                                                                 | `LweCiphertext<Vec<u64>>`  | `A+B`        |
+| `lwe_prod`           | LWE encryption of A times cleartext B                                                                        | `LweCiphertext<Vec<u64>>`  | `A*B`        |
+| `lwe_ms`             | The LWE ciphertext after the modswitch part of the PBS ([note](#non-native-encoding))                        | `LweCiphertext<Vec<u64>>`  | `A`          |
+| `lwe_ks`             | The LWE ciphertext after the keyswitch                                                                       | `LweCiphertext<Vec<u64>>`  | `A`          |
+| `glwe_after_id_br`   | The GLWE returned by the application of the identity blind rotation on the mod switched ciphertexts.         | `GlweCiphertext<Vec<u64>>` | rot id LUT   |
+| `lwe_after_id_pbs`   | The LWE returned by the application of the sample extract operation on the output of the id blind rotation   | `LweCiphertext<Vec<u64>>`  | `A`          |
+| `glwe_after_spec_br` | The GLWE returned by the application of the spec blind rotation on the mod switched ciphertexts.             | `GlweCiphertext<Vec<u64>>` | rot spec LUT |
+| `lwe_after_spec_pbs` | The LWE returned by the application of the sample extract operation on the output of the spec blind rotation | `LweCiphertext<Vec<u64>>`  | `spec(A)`    |

 ### Encodings
 #### Non native encoding
--- a/apps/test-vectors/data/toy_params/glwe_after_id_br_karatsuba.cbor
+++ b/apps/test-vectors/data/toy_params/glwe_after_id_br_karatsuba.cbor
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:08f31a47c29cc4d72ad32c0b5411fa20b3deef5b84558dd2fb892d3cdf90528a
-size 4679
--- a/apps/test-vectors/data/toy_params/glwe_after_spec_br_karatsuba.cbor
+++ b/apps/test-vectors/data/toy_params/glwe_after_spec_br_karatsuba.cbor
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9ece8ca9c1436258b94e8c5e629b8722f9b18fdd415dd5209b6167a9dde8491c
-size 4679
--- a/apps/test-vectors/data/toy_params/lwe_after_id_pbs_karatsuba.cbor
+++ b/apps/test-vectors/data/toy_params/lwe_after_id_pbs_karatsuba.cbor
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2fb4bb45c259b8383da10fc8f9459c40a6972c49b1696eb107f0a75640724be5
-size 2365
--- a/apps/test-vectors/data/toy_params/lwe_after_spec_pbs_karatsuba.cbor
+++ b/apps/test-vectors/data/toy_params/lwe_after_spec_pbs_karatsuba.cbor
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b7b8e3586128887bd682120f3e3a43156139bce5e3fe0b03284f8753a864d647
-size 2365
--- a/apps/test-vectors/data/valid_params_128/glwe_after_id_br_karatsuba.cbor
+++ b/apps/test-vectors/data/valid_params_128/glwe_after_id_br_karatsuba.cbor
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5d80dd93fefae4f4f89484dfcd65bbe99cc32e7e3b0a90c33dd0d77516c0a023
-size 36935
--- a/apps/test-vectors/data/valid_params_128/glwe_after_spec_br_karatsuba.cbor
+++ b/apps/test-vectors/data/valid_params_128/glwe_after_spec_br_karatsuba.cbor
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e59b002df3a9b01ad321ec51cf076fa35131ab9dbef141d1c54b717d61426c92
-size 36935
--- a/apps/test-vectors/data/valid_params_128/lwe_after_id_pbs_karatsuba.cbor
+++ b/apps/test-vectors/data/valid_params_128/lwe_after_id_pbs_karatsuba.cbor
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:76a5c52cab7fec1dc167da676c6cd39479cda6b2bb9f4e0573cb7d99c2692faa
-size 18493
--- a/apps/test-vectors/data/valid_params_128/lwe_after_spec_pbs_karatsuba.cbor
+++ b/apps/test-vectors/data/valid_params_128/lwe_after_spec_pbs_karatsuba.cbor
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d3391969acf26dc69de0927ba279139d8d79999944069addc8ff469ad6c5ae2d
-size 18493
--- a/apps/test-vectors/src/main.rs
+++ b/apps/test-vectors/src/main.rs
@@ -265,7 +265,6 @@ fn generate_test_vectors<P: AsRef<Path>>(

    let mut id_lut = encoding.encode_lut(glwe_dimension, polynomial_size, ID_LUT);
    assert_data_not_zero(&id_lut);
-    let mut id_lut_karatsuba = id_lut.clone();

    blind_rotate_assign(&modswitched, &mut id_lut, &fourier_bsk);
    assert_data_not_zero(&id_lut);
@@ -288,32 +287,8 @@ fn generate_test_vectors<P: AsRef<Path>>(
    assert_data_not_zero(&lwe_pbs_id);
    store_data(path, &lwe_pbs_id, "lwe_after_id_pbs");

-    blind_rotate_karatsuba_assign(&modswitched, &mut id_lut_karatsuba, &bsk);
-    store_data(path, &id_lut_karatsuba, "glwe_after_id_br_karatsuba");
-
-    let mut lwe_pbs_karatsuba_id = LweCiphertext::new(
-        0u64,
-        glwe_dimension
-            .to_equivalent_lwe_dimension(polynomial_size)
-            .to_lwe_size(),
-        encoding.ciphertext_modulus,
-    );
-
-    extract_lwe_sample_from_glwe_ciphertext(
-        &id_lut_karatsuba,
-        &mut lwe_pbs_karatsuba_id,
-        MonomialDegree(0),
-    );
-
-    let decrypted_pbs_id = decrypt_lwe_ciphertext(&large_lwe_secret_key, &lwe_pbs_karatsuba_id);
-    let res = encoding.decode(decrypted_pbs_id);
-
-    assert_eq!(res, MSG_A);
-    store_data(path, &lwe_pbs_karatsuba_id, "lwe_after_id_pbs_karatsuba");
-
    let mut spec_lut = encoding.encode_lut(glwe_dimension, polynomial_size, SPEC_LUT);
    assert_data_not_zero(&spec_lut);
-    let mut spec_lut_karatsuba = spec_lut.clone();

    blind_rotate_assign(&modswitched, &mut spec_lut, &fourier_bsk);
    assert_data_not_zero(&spec_lut);
@@ -335,33 +310,6 @@ fn generate_test_vectors<P: AsRef<Path>>(
    assert_eq!(res, SPEC_LUT(MSG_A));
    assert_data_not_zero(&lwe_pbs_spec);
    store_data(path, &lwe_pbs_spec, "lwe_after_spec_pbs");
-
-    blind_rotate_karatsuba_assign(&modswitched, &mut spec_lut_karatsuba, &bsk);
-    store_data(path, &spec_lut_karatsuba, "glwe_after_spec_br_karatsuba");
-
-    let mut lwe_pbs_karatsuba_spec = LweCiphertext::new(
-        0u64,
-        glwe_dimension
-            .to_equivalent_lwe_dimension(polynomial_size)
-            .to_lwe_size(),
-        encoding.ciphertext_modulus,
-    );
-
-    extract_lwe_sample_from_glwe_ciphertext(
-        &spec_lut_karatsuba,
-        &mut lwe_pbs_karatsuba_spec,
-        MonomialDegree(0),
-    );
-
-    let decrypted_pbs_spec = decrypt_lwe_ciphertext(&large_lwe_secret_key, &lwe_pbs_karatsuba_spec);
-    let res = encoding.decode(decrypted_pbs_spec);
-
-    assert_eq!(res, SPEC_LUT(MSG_A));
-    store_data(
-        path,
-        &lwe_pbs_karatsuba_spec,
-        "lwe_after_spec_pbs_karatsuba",
-    );
 }

 fn rm_dir_except_readme<P: AsRef<Path>>(dir: P) {
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.13.0"
+version = "0.12.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -86,7 +86,6 @@ fn main() {
            "cuda/include/integer/integer.h",
            "cuda/include/integer/rerand.h",
            "cuda/include/aes/aes.h",
-            "cuda/include/trivium/trivium.h",
            "cuda/include/zk/zk.h",
            "cuda/include/keyswitch/keyswitch.h",
            "cuda/include/keyswitch/ks_enums.h",
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -29,13 +29,14 @@ template <typename Torus> struct int_aes_lut_buffers {
        allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus, Torus)> and_lambda =
        [](Torus a, Torus b) -> Torus { return a & b; };
-
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
+        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, and_lambda, allocate_gpu_memory);
    auto active_streams_and_lut = streams.active_gpu_subset(
-        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
-        params.pbs_type);
-    this->and_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_and_lut, {0}, {and_lambda}, allocate_gpu_memory);
-
+        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism);
+    this->and_lut->broadcast_lut(active_streams_and_lut);
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->flush_lut = new int_radix_lut<Torus>(
@@ -44,11 +45,14 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
      return x & 1;
    };
-
-    auto active_streams_flush_lut = streams.active_gpu_subset(
-        AES_STATE_BITS * num_aes_inputs, params.pbs_type);
-    this->flush_lut->generate_and_broadcast_lut(
-        active_streams_flush_lut, {0}, {flush_lambda}, allocate_gpu_memory);
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
+        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, flush_lambda, allocate_gpu_memory);
+    auto active_streams_flush_lut =
+        streams.active_gpu_subset(AES_STATE_BITS * num_aes_inputs);
+    this->flush_lut->broadcast_lut(active_streams_flush_lut);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->carry_lut = new int_radix_lut<Torus>(
@@ -56,11 +60,13 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
      return (x >> 1) & 1;
    };
-
-    auto active_streams_carry_lut =
-        streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
-    this->carry_lut->generate_and_broadcast_lut(
-        active_streams_carry_lut, {0}, {carry_lambda}, allocate_gpu_memory);
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
+        this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, carry_lambda, allocate_gpu_memory);
+    auto active_streams_carry_lut = streams.active_gpu_subset(num_aes_inputs);
+    this->carry_lut->broadcast_lut(active_streams_carry_lut);
    this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -35,9 +35,17 @@ void cuda_centered_modulus_switch_64(void *stream, uint32_t gpu_index,
                                     uint32_t lwe_dimension,
                                     uint32_t log_modulus);

+void cuda_improve_noise_modulus_switch_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus);
+
 void cuda_glwe_sample_extract_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
    uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
 }
+
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -8,8 +8,7 @@

 extern std::mutex m;
 extern bool p2p_enabled;
-extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
-extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+extern const int THRESHOLD_MULTI_GPU;

 extern "C" {
 int32_t cuda_setup_multi_gpu(int device_0_id);
@@ -40,8 +39,7 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
  }
 }

-uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
-                              PBS_TYPE pbs_type);
+uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);

 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

@@ -75,10 +73,9 @@ public:

  // Returns a subset of this set as an active subset. An active subset is one
  // that is temporarily used to perform some computation
-  CudaStreams active_gpu_subset(int num_radix_blocks, PBS_TYPE pbs_type) {
-    return CudaStreams(
-        _streams, _gpu_indexes,
-        get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
+  CudaStreams active_gpu_subset(int num_radix_blocks) {
+    return CudaStreams(_streams, _gpu_indexes,
+                       get_active_gpu_count(num_radix_blocks, _gpu_count));
  }

  // Returns a CudaStreams struct containing only the ith stream
--- a/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
@@ -20,8 +20,7 @@ template <typename Torus> struct boolean_bitop_buffer {
    gpu_memory_allocated = allocate_gpu_memory;
    this->op = op;
    this->params = params;
-    auto active_streams =
-        streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
    this->unchecked = is_unchecked;
    switch (op) {
    case BITAND:
@@ -65,8 +64,14 @@ template <typename Torus> struct boolean_bitop_buffer {
        return x % params.message_modulus;
      };

-      message_extract_lut->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          message_extract_lut->get_lut(0, 0),
+          message_extract_lut->get_degree(0),
+          message_extract_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_message_extract, gpu_memory_allocated);
+      message_extract_lut->broadcast_lut(active_streams);
    }
    tmp_lwe_left = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -114,8 +119,7 @@ template <typename Torus> struct int_bitop_buffer {
    gpu_memory_allocated = allocate_gpu_memory;
    this->op = op;
    this->params = params;
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
    switch (op) {
    case BITAND:
    case BITOR:
@@ -136,8 +140,12 @@ template <typename Torus> struct int_bitop_buffer {
          }
        };

-        lut->generate_and_broadcast_bivariate_lut(
-            active_streams, {0}, {lut_bivariate_f}, gpu_memory_allocated);
+        generate_device_accumulator_bivariate<Torus>(
+            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
+        lut->broadcast_lut(active_streams);
      }
      break;
    default:
@@ -146,8 +154,6 @@ template <typename Torus> struct int_bitop_buffer {
                                     num_radix_blocks, allocate_gpu_memory,
                                     size_tracker);

-      std::vector<std::function<Torus(Torus)>> lut_funcs;
-      std::vector<uint32_t> lut_indices;
      for (int i = 0; i < params.message_modulus; i++) {
        auto rhs = i;

@@ -163,13 +169,14 @@ template <typename Torus> struct int_bitop_buffer {
            return x ^ rhs;
          }
        };
-
-        lut_funcs.push_back(lut_univariate_scalar_f);
-        lut_indices.push_back(i);
+        generate_device_accumulator<Torus>(
+            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
+            lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_univariate_scalar_f,
+            gpu_memory_allocated);
+        lut->broadcast_lut(active_streams);
      }
-
-      lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
-                                      gpu_memory_allocated);
    }
  }

@@ -202,11 +209,15 @@ template <typename Torus> struct boolean_bitnot_buffer {
        return x % message_modulus;
      };

-      auto active_streams =
-          streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
-
-      message_extract_lut->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          message_extract_lut->get_lut(0, 0),
+          message_extract_lut->get_degree(0),
+          message_extract_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_message_extract, gpu_memory_allocated);
+      auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+      message_extract_lut->broadcast_lut(active_streams);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
@@ -28,17 +28,20 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
      uint32_t bits_per_block = std::log2(params.message_modulus);
      uint32_t msg_modulus = params.message_modulus;

-      auto active_streams =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-
-      lut->generate_and_broadcast_lut(
-          active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          [msg_modulus, bits_per_block](Torus x) {
            const auto xm = x % msg_modulus;
            const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
            return (Torus)((msg_modulus - 1) * sign_bit);
-          }},
+          },
          allocate_gpu_memory);

+      auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+      lut->broadcast_lut(active_streams);
+
      this->last_block = new CudaRadixCiphertextFFI;

      create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
@@ -14,8 +14,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
                         uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->params = params;
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);

    tmp = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -85,6 +84,24 @@ template <typename Torus> struct int_cmux_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
+        predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
+        predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
+        message_extract_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        message_extract_lut_f, gpu_memory_allocated);
    Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
    for (int index = 0; index < 2 * num_radix_blocks; index++) {
      if (index < num_radix_blocks) {
@@ -97,18 +114,10 @@ template <typename Torus> struct int_cmux_buffer {
        predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
        2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-
-    auto active_streams_pred =
-        streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
-    predicate_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
-        gpu_memory_allocated);
-
-    auto active_streams_msg =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-
-    message_extract_lut->generate_and_broadcast_lut(
-        active_streams_msg, {0}, {message_extract_lut_f}, gpu_memory_allocated);
+    auto active_streams_pred = streams.active_gpu_subset(2 * num_radix_blocks);
+    predicate_lut->broadcast_lut(active_streams_pred);
+    auto active_streams_msg = streams.active_gpu_subset(num_radix_blocks);
+    message_extract_lut->broadcast_lut(active_streams_msg);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -39,21 +39,21 @@ template <typename Torus> struct int_are_all_block_true_buffer {
        max_chunks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

-    preallocated_h_lut = (Torus *)malloc(
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
-
    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
                                            allocate_gpu_memory, size_tracker);
-
-    auto active_streams =
-        streams.active_gpu_subset(max_chunks, params.pbs_type);
-
    auto is_max_value_f = [max_value](Torus x) -> Torus {
      return x == max_value;
    };
+    preallocated_h_lut = (Torus *)malloc(
+        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
+        is_max_value->get_degree(0), is_max_value->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, is_max_value_f, gpu_memory_allocated);

-    is_max_value->generate_and_broadcast_lut(
-        active_streams, {0}, {is_max_value_f}, gpu_memory_allocated);
+    auto active_streams = streams.active_gpu_subset(max_chunks);
+    is_max_value->broadcast_lut(active_streams);
  }

  void release(CudaStreams streams) {
@@ -102,10 +102,14 @@ template <typename Torus> struct int_comparison_eq_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    is_non_zero_lut->generate_and_broadcast_lut(
-        active_streams, {0}, {is_non_zero_lut_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
+        is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
+
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    is_non_zero_lut->broadcast_lut(active_streams);

    // Scalar may have up to num_radix_blocks blocks
    scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -123,28 +127,32 @@ template <typename Torus> struct int_comparison_eq_buffer {
        return (lhs == rhs);
      }
    };
-
-    std::vector<std::function<Torus(Torus)>> lut_funcs;
-    std::vector<uint32_t> lut_indices;
    for (int i = 0; i < total_modulus; i++) {
      auto lut_f = [i, operator_f](Torus x) -> Torus {
        return operator_f(i, x);
      };
-      lut_funcs.push_back(lut_f);
-      lut_indices.push_back(i);
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          scalar_comparison_luts->get_lut(0, i),
+          scalar_comparison_luts->get_degree(i),
+          scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f, gpu_memory_allocated);
    }
-
-    scalar_comparison_luts->generate_and_broadcast_lut(
-        active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
-
+    scalar_comparison_luts->broadcast_lut(active_streams);
    if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
      operator_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

-      operator_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {operator_f}, gpu_memory_allocated);
-      // operator_lut->broadcast_lut(active_streams);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
+          operator_lut->get_degree(0), operator_lut->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, operator_f, gpu_memory_allocated);
+
+      operator_lut->broadcast_lut(active_streams);
    } else {
      operator_lut = nullptr;
    }
@@ -211,6 +219,9 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
        streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    // LUTs
+    tree_inner_leaf_lut =
+        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
+                                 allocate_gpu_memory, size_tracker);

    tree_last_leaf_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -221,14 +232,14 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
    tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-    tree_inner_leaf_lut =
-        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
-                                 allocate_gpu_memory, size_tracker);
-
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {block_selector_f}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
+        tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        block_selector_f, gpu_memory_allocated);
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    tree_inner_leaf_lut->broadcast_lut(active_streams);
  }

  void release(CudaStreams streams) {
@@ -379,8 +390,7 @@ template <typename Torus> struct int_comparison_buffer {
    this->op = op;
    this->is_signed = is_signed;

-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);

    identity_lut_f = [](Torus x) -> Torus { return x; };

@@ -412,8 +422,12 @@ template <typename Torus> struct int_comparison_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    identity_lut->generate_and_broadcast_lut(
-        active_streams, {0}, {identity_lut_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
+        identity_lut->get_degree(0), identity_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, identity_lut_f, gpu_memory_allocated);
+    identity_lut->broadcast_lut(active_streams);

    uint32_t total_modulus = params.message_modulus * params.carry_modulus;
    auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -423,8 +437,13 @@ template <typename Torus> struct int_comparison_buffer {
    is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                           allocate_gpu_memory, size_tracker);

-    is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
-                                            gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
+        is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, is_zero_f, gpu_memory_allocated);
+
+    is_zero_lut->broadcast_lut(active_streams);

    switch (op) {
    case COMPARISON_TYPE::MAX:
@@ -499,9 +518,13 @@ template <typename Torus> struct int_comparison_buffer {
        PANIC("Cuda error: sign_lut creation failed due to wrong function.")
      };

-      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      signed_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {signed_lut_f}, gpu_memory_allocated);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
+          signed_lut->get_degree(0), signed_lut->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, signed_lut_f, gpu_memory_allocated);
+      auto active_streams = streams.active_gpu_subset(1);
+      signed_lut->broadcast_lut(active_streams);
    }
    preallocated_h_lut = (Torus *)malloc(
        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -65,16 +65,6 @@ void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,

 void cleanup_cuda_integer_decompress_radix_ciphertext_128(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);
-
-void cuda_integer_extract_glwe_128(
-    CudaStreamsFFI streams, void *glwe_array_out,
-    CudaPackedGlweCiphertextListFFI const *glwe_list,
-    uint32_t const glwe_index);
-
-void cuda_integer_extract_glwe_64(
-    CudaStreamsFFI streams, void *glwe_array_out,
-    CudaPackedGlweCiphertextListFFI const *glwe_list,
-    uint32_t const glwe_index);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -116,8 +116,7 @@ template <typename Torus> struct int_decompression {
          effective_compression_carry_modulus,
          encryption_params.message_modulus, encryption_params.carry_modulus,
          decompression_rescale_f, gpu_memory_allocated);
-      auto active_streams = streams.active_gpu_subset(
-          num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
+      auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
      decompression_rescale_lut->broadcast_lut(active_streams);
    }
  }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -283,9 +283,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                                     zero_out_if_not_1_lut_2};
    size_t lut_gpu_indexes[2] = {0, 3};
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
-                                          {0}, {zero_out_if_not_1_lut_f},
-                                          gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(lut_gpu_indexes[j]),
+          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
    }

    luts[0] = zero_out_if_not_2_lut_1;
@@ -293,9 +296,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    lut_gpu_indexes[0] = 1;
    lut_gpu_indexes[1] = 2;
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
-                                          {0}, {zero_out_if_not_2_lut_f},
-                                          gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(lut_gpu_indexes[j]),
+          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
    }

    quotient_lut_1 =
@@ -315,12 +321,21 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    };
    auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };

-    quotient_lut_1->generate_and_broadcast_lut(
-        streams.get_ith(2), {0}, {quotient_lut_1_f}, gpu_memory_allocated);
-    quotient_lut_2->generate_and_broadcast_lut(
-        streams.get_ith(1), {0}, {quotient_lut_2_f}, gpu_memory_allocated);
-    quotient_lut_3->generate_and_broadcast_lut(
-        streams.get_ith(0), {0}, {quotient_lut_3_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
+        quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
+        quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
+        quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);

    message_extract_lut_1 = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -335,12 +350,14 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    luts[0] = message_extract_lut_1;
    luts[1] = message_extract_lut_2;

-    auto active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
-
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
+      auto active_streams = streams.active_gpu_subset(num_blocks);
+      luts[j]->broadcast_lut(active_streams);
    }
  }

@@ -989,14 +1006,23 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      masking_luts_2[i] = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

-      auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
-      masking_luts_1[i]->generate_and_broadcast_lut(
-          active_streams_1, {0}, {lut_f_masking}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
+          masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_masking, gpu_memory_allocated);
+      auto active_streams_1 = streams.active_gpu_subset(1);
+      masking_luts_1[i]->broadcast_lut(active_streams_1);

-      auto active_streams_2 =
-          streams.active_gpu_subset(num_blocks, params.pbs_type);
-      masking_luts_2[i]->generate_and_broadcast_lut(
-          active_streams_2, {0}, {lut_f_masking}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
+          masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_masking, gpu_memory_allocated);
+      auto active_streams_2 = streams.active_gpu_subset(num_blocks);
+      masking_luts_2[i]->broadcast_lut(active_streams_2);
    }

    // create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1014,12 +1040,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

    int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
                                     message_extract_lut_2};
-
-    auto active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(num_blocks);
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
+      luts[j]->broadcast_lut(active_streams);
    }

    // Give name to closures to improve readability
@@ -1100,8 +1128,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

    // merge_overflow_flags_luts
    merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
-    auto active_gpu_count_for_bits =
-        streams.active_gpu_subset(1, params.pbs_type);
+    auto active_gpu_count_for_bits = streams.active_gpu_subset(1);
    for (int i = 0; i < num_bits_in_message; i++) {
      auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
        return (x == 0 && y == 0) << i;
@@ -1110,8 +1137,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-      merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
-          active_gpu_count_for_bits, {0}, {lut_f_bit}, gpu_memory_allocated);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          merge_overflow_flags_luts[i]->get_lut(0, 0),
+          merge_overflow_flags_luts[i]->get_degree(0),
+          merge_overflow_flags_luts[i]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, lut_f_bit, gpu_memory_allocated);
+      merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
    }
  }

@@ -1119,8 +1152,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
                              uint32_t num_blocks, bool allocate_gpu_memory,
                              uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
-    auto active_streams =
-        streams.active_gpu_subset(2 * num_blocks, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(2 * num_blocks);
    this->params = params;

    if (params.message_modulus == 4 && params.carry_modulus == 4 &&
@@ -1441,8 +1473,7 @@ template <typename Torus> struct int_div_rem_memory {
                     bool allocate_gpu_memory, uint64_t &size_tracker) {

    gpu_memory_allocated = allocate_gpu_memory;
-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);
    this->params = params;
    this->is_signed = is_signed;

@@ -1520,12 +1551,16 @@ template <typename Torus> struct int_div_rem_memory {
      compare_signed_bits_lut = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          compare_signed_bits_lut->get_lut(0, 0),
+          compare_signed_bits_lut->get_degree(0),
+          compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          f_compare_extracted_signed_bits, gpu_memory_allocated);
      auto active_gpu_count_cmp =
-          streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
-
-      compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
-          active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
-          gpu_memory_allocated);
+          streams.active_gpu_subset(1); // only 1 block needed
+      compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
@@ -20,8 +20,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
    this->allocate_gpu_memory = allocate_gpu_memory;
    this->direction = direction;
    this->bit_value = bit_value;
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
    this->univ_lut_mem =
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);
@@ -53,8 +52,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return count;
    };

-    univ_lut_mem->generate_and_broadcast_lut(
-        active_streams, {0}, {generate_uni_lut_lambda}, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
+        univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
+
+    univ_lut_mem->broadcast_lut(active_streams);

    auto generate_bi_lut_lambda =
        [num_bits](Torus block_num_bit_count,
@@ -65,8 +69,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return 0;
    };

-    biv_lut_mem->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {generate_bi_lut_lambda}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
+        biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
+
+    biv_lut_mem->broadcast_lut(active_streams);

    this->tmp_ct = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -222,7 +231,7 @@ template <typename Torus> struct int_ilog2_buffer {
        this->sum_output_not_propagated, counter_num_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

-    lut_message_not =
+    this->lut_message_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
                                 allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus)> lut_message_lambda =
@@ -230,11 +239,15 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t message = x % this->params.message_modulus;
      return (~message) % this->params.message_modulus;
    };
-
-    auto active_streams =
-        streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
-    lut_message_not->generate_and_broadcast_lut(
-        active_streams, {0}, {lut_message_lambda}, allocate_gpu_memory);
+    generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
+                                this->lut_message_not->get_lut(0, 0),
+                                this->lut_message_not->get_degree(0),
+                                this->lut_message_not->get_max_degree(0),
+                                params.glwe_dimension, params.polynomial_size,
+                                params.message_modulus, params.carry_modulus,
+                                lut_message_lambda, allocate_gpu_memory);
+    auto active_streams = streams.active_gpu_subset(counter_num_blocks);
+    lut_message_not->broadcast_lut(active_streams);

    this->lut_carry_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -244,8 +257,13 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t carry = x / this->params.message_modulus;
      return (~carry) % this->params.message_modulus;
    };
-    lut_carry_not->generate_and_broadcast_lut(
-        active_streams, {0}, {lut_carry_lambda}, allocate_gpu_memory);
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0),
+        this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
+        this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        lut_carry_lambda, allocate_gpu_memory);
+    lut_carry_not->broadcast_lut(active_streams);

    this->message_blocks_not = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -883,10 +883,6 @@ void cuda_unchecked_first_index_in_clears_64(
    uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
    int8_t *mem, void *const *bsks, void *const *ksks);

-void cuda_small_scalar_multiplication_integer_64_inplace(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint64_t scalar,
-    const uint32_t message_modulus, const uint32_t carry_modulus);
-
 void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);

--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -9,7 +9,6 @@
 #include "utils/helper_multi_gpu.cuh"
 #include <cmath>
 #include <functional>
-#include <map>
 #include <queue>

 #include <stdio.h>
@@ -44,8 +43,6 @@ public:
              "parameters");                                                   \
    } else if ((msg_mod) == 0 && (carry_mod) == 0) {                           \
      break;                                                                   \
-    } else if ((msg_mod) == 4 && (carry_mod) == 32) {                          \
-      break;                                                                   \
    } else {                                                                   \
      PANIC("Invalid message modulus or carry modulus")                        \
    }                                                                          \
@@ -372,8 +369,7 @@ struct int_radix_lut_custom_input_output {
    this->num_input_blocks = num_input_blocks;
    this->gpu_memory_allocated = allocate_gpu_memory;

-    this->active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_radix_blocks);
  }

  void setup_degrees() {
@@ -384,18 +380,14 @@ struct int_radix_lut_custom_input_output {

  void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {
-
-    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
-                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
-
    for (uint i = 0; i < active_streams.count(); i++) {
      cuda_set_device(active_streams.gpu_index(i));
      int8_t *gpu_pbs_buffer;
-      auto num_blocks_on_gpu = std::min(
-          (int)num_radix_blocks,
-          std::max(threshold, get_num_inputs_on_gpu(num_radix_blocks, i,
-                                                    active_streams.count())));
+      auto num_blocks_on_gpu =
+          std::min((int)num_radix_blocks,
+                   std::max(THRESHOLD_MULTI_GPU,
+                            get_num_inputs_on_gpu(num_radix_blocks, i,
+                                                  active_streams.count())));

      uint64_t size = 0;
      execute_scratch_pbs<OutputTorus>(
@@ -430,22 +422,18 @@ struct int_radix_lut_custom_input_output {
    /// back to the original indexing
    multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec,
                              num_radix_blocks, params.big_lwe_dimension + 1,
-                              size_tracker, params.pbs_type,
-                              allocate_gpu_memory);
+                              size_tracker, allocate_gpu_memory);
    multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec,
                              num_radix_blocks, params.small_lwe_dimension + 1,
-                              size_tracker, params.pbs_type,
-                              allocate_gpu_memory);
+                              size_tracker, allocate_gpu_memory);
    if (num_many_lut > 1) {
      multi_gpu_alloc_lwe_many_lut_output_async(
          active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut,
-          params.big_lwe_dimension + 1, size_tracker, params.pbs_type,
-          allocate_gpu_memory);
+          params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory);
    } else {
      multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec,
                                num_radix_blocks, params.big_lwe_dimension + 1,
-                                size_tracker, params.pbs_type,
-                                allocate_gpu_memory);
+                                size_tracker, allocate_gpu_memory);
    }
    multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec,
                                num_radix_blocks, size_tracker,
@@ -461,14 +449,12 @@ struct int_radix_lut_custom_input_output {
  }

  void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
-    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
-                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;

-    auto inputs_on_gpu = std::min(
-        (int)num_input_blocks,
-        std::max(threshold, get_num_inputs_on_gpu(num_input_blocks, 0,
-                                                  active_streams.count())));
+    auto inputs_on_gpu =
+        std::min((int)num_input_blocks,
+                 std::max(THRESHOLD_MULTI_GPU,
+                          get_num_inputs_on_gpu(num_input_blocks, 0,
+                                                active_streams.count())));

    if (inputs_on_gpu >= get_threshold_ks_gemm()) {
      for (auto i = 0; i < active_streams.count(); ++i) {
@@ -810,20 +796,16 @@ struct int_radix_lut_custom_input_output {
  void allocate_lwe_vector_for_non_trivial_indexes(
      CudaStreams streams, uint64_t max_num_radix_blocks,
      uint64_t &size_tracker, bool allocate_gpu_memory) {
-
-    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
-                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
-
    // We need to create the auxiliary array only in GPU 0
    if (active_streams.count() > 1) {
      lwe_aligned_vec.resize(active_streams.count());
      for (uint i = 0; i < active_streams.count(); i++) {
        uint64_t size_tracker_on_array_i = 0;
-        auto inputs_on_gpu = std::min(
-            (int)max_num_radix_blocks,
-            std::max(threshold, get_num_inputs_on_gpu(max_num_radix_blocks, i,
-                                                      active_streams.count())));
+        auto inputs_on_gpu =
+            std::min((int)max_num_radix_blocks,
+                     std::max(THRESHOLD_MULTI_GPU,
+                              get_num_inputs_on_gpu(max_num_radix_blocks, i,
+                                                    active_streams.count())));
        InputTorus *d_array =
            (InputTorus *)cuda_malloc_with_size_tracking_async(
                inputs_on_gpu * (params.big_lwe_dimension + 1) *
@@ -836,56 +818,6 @@ struct int_radix_lut_custom_input_output {
    }
  }

-  void generate_and_broadcast_lut(
-      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
-      std::vector<std::function<OutputTorus(OutputTorus)>> f,
-      bool gpu_memory_allocated) {
-    // streams should be a subset of active_streams
-
-    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
-      generate_device_accumulator<OutputTorus>(
-          streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
-          get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, f[i], gpu_memory_allocated);
-    }
-   //broadcast_lut(streams);
-  }
-
-  void generate_and_broadcast_bivariate_lut(
-      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
-      std::vector<std::function<OutputTorus(OutputTorus, OutputTorus)>> f,
-      bool gpu_memory_allocated) {
-    // streams should be a subset of active_streams
-
-    /*    for (int fidx = 0; fidx < f.size(); ++fidx) {
-          __int128_t f_hash = 0;
-          uint32_t bits_per_lut_val = 5;
-          uint32_t input_modulus_sup =
-              params.message_modulus * params.carry_modulus;
-          for (uint32_t i = 0; i < input_modulus_sup; ++i) {
-            OutputTorus f_eval =
-                f[fidx](i / params.message_modulus, i % params.message_modulus);
-            GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
-                       "LUT value expected bitwidth overflow");
-            f_hash |= f_eval;
-            f_hash <<= bits_per_lut_val;
-          }
-          printf("%016llX%016llX\n",
-                 (unsigned long long)((f_hash >> 64) & 0xFFFFFFFFFFFFFFFF),
-                 (unsigned long long)(f_hash & 0xFFFFFFFFFFFFFFFF));
-        }
-    */
-    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
-      generate_device_accumulator_bivariate<InputTorus>(
-          streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
-          get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, f[i], gpu_memory_allocated);
-    }
-    //broadcast_lut(streams);
-  }
-
  void release(CudaStreams streams) {
    PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(),
                   "Lut vec and Lut vec indexes must have the same size");
@@ -1036,15 +968,18 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
                                   bits_per_block * num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

-    std::vector<std::function<Torus(Torus)>> lut_funs;
-    std::vector<uint32_t> lut_indices;
    for (int i = 0; i < bits_per_block; i++) {
+
      auto operator_f = [i, final_offset](Torus x) -> Torus {
        Torus y = (x >> i) & 1;
        return y << final_offset;
      };
-      lut_funs.push_back(operator_f);
-      lut_indices.push_back(i);
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
+          lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          operator_f, gpu_memory_allocated);
    }

    /**
@@ -1061,12 +996,9 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
        num_radix_blocks * bits_per_block * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);

-    auto active_streams = streams.active_gpu_subset(
-        bits_per_block * num_radix_blocks, params.pbs_type);
-
-    lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
-                                    gpu_memory_allocated);
-    // lut->broadcast_lut(active_streams);
+    auto active_streams =
+        streams.active_gpu_subset(bits_per_block * num_radix_blocks);
+    lut->broadcast_lut(active_streams);

    /**
     * the input indexes should take the first bits_per_block PBS to target
@@ -1142,6 +1074,24 @@ template <typename Torus> struct int_fullprop_buffer {
    };

    //
+    Torus *lut_buffer_message = lut->get_lut(0, 0);
+    uint64_t *message_degree = lut->get_degree(0);
+    uint64_t *message_max_degree = lut->get_max_degree(0);
+    Torus *lut_buffer_carry = lut->get_lut(0, 1);
+    uint64_t *carry_degree = lut->get_degree(1);
+    uint64_t *carry_max_degree = lut->get_max_degree(1);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut_buffer_message,
+        message_degree, message_max_degree, params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        lut_f_message, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut_buffer_carry, carry_degree,
+        carry_max_degree, params.glwe_dimension, params.polynomial_size,
+        params.message_modulus, params.carry_modulus, lut_f_carry,
+        gpu_memory_allocated);

    uint64_t lwe_indexes_size = 2 * sizeof(Torus);
    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
@@ -1151,15 +1101,9 @@ template <typename Torus> struct int_fullprop_buffer {
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-
    //
    // No broadcast is needed because full prop is done on 1 single GPU.
-    // By passing a single-GPU CudaStreams with streams.get_ith(0) the LUT is
-    // not broadcast.
    //
-    lut->generate_and_broadcast_lut(streams.get_ith(0), {0, 1},
-                                    {lut_f_message, lut_f_carry},
-                                    gpu_memory_allocated);

    tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -1277,10 +1221,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
      if (total_ciphertexts > 0 ||
          reduce_degrees_for_single_carry_propagation) {
        uint64_t size_tracker = 0;
-        allocated_luts_message_carry = true;
        luts_message_carry = new int_radix_lut<Torus>(
            streams, params, 2, pbs_count, true, size_tracker);
-
+        allocated_luts_message_carry = true;
        uint64_t message_modulus_bits =
            (uint64_t)std::log2(params.message_modulus);
        uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
@@ -1296,9 +1239,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
            streams, upper_bound_num_blocks, size_tracker, true);
      }
    }
-
    if (allocated_luts_message_carry) {
-
      auto message_acc = luts_message_carry->get_lut(0, 0);
      auto carry_acc = luts_message_carry->get_lut(0, 1);

@@ -1310,11 +1251,21 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
        return x / message_modulus;
      };

-      auto active_gpu_count_mc =
-          streams.active_gpu_subset(pbs_count, params.pbs_type);
-      luts_message_carry->generate_and_broadcast_lut(
-          active_gpu_count_mc, {0, 1}, {lut_f_message, lut_f_carry},
-          gpu_memory_allocated);
+      // generate accumulators
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), message_acc,
+          luts_message_carry->get_degree(0),
+          luts_message_carry->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, message_modulus, params.carry_modulus,
+          lut_f_message, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), carry_acc,
+          luts_message_carry->get_degree(1),
+          luts_message_carry->get_max_degree(1), params.glwe_dimension,
+          params.polynomial_size, message_modulus, params.carry_modulus,
+          lut_f_carry, gpu_memory_allocated);
+      auto active_gpu_count_mc = streams.active_gpu_subset(pbs_count);
+      luts_message_carry->broadcast_lut(active_gpu_count_mc);
    }
  }
  int_sum_ciphertexts_vec_memory(
@@ -1449,6 +1400,10 @@ template <typename Torus> struct int_seq_group_prop_memory {
                            uint32_t group_size, uint32_t big_lwe_size_bytes,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
+    auto glwe_dimension = params.glwe_dimension;
+    auto polynomial_size = params.polynomial_size;
+    auto message_modulus = params.message_modulus;
+    auto carry_modulus = params.carry_modulus;

    grouping_size = group_size;
    group_resolved_carries = new CudaRadixCiphertextFFI;
@@ -1458,33 +1413,31 @@ template <typename Torus> struct int_seq_group_prop_memory {
        allocate_gpu_memory);

    int num_seq_luts = grouping_size - 1;
+    Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
    lut_sequential_algorithm =
        new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
                                 allocate_gpu_memory, size_tracker);
-    std::vector<std::function<Torus(Torus)>> lut_funcs;
-    std::vector<uint32_t> lut_indices;
-    Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
-
    for (int index = 0; index < num_seq_luts; index++) {
      auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
        return (propa_cum_sum_block >> (index + 1)) & 1;
      };
-      lut_funcs.push_back(f_lut_sequential);
+      auto seq_lut = lut_sequential_algorithm->get_lut(0, index);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), seq_lut,
+          lut_sequential_algorithm->get_degree(index),
+          lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
+          gpu_memory_allocated);
      h_seq_lut_indexes[index] = index;
-      lut_indices.push_back(index);
    }
    Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams =
-        streams.active_gpu_subset(num_seq_luts, params.pbs_type);
-    lut_sequential_algorithm->generate_and_broadcast_lut(
-        active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
-    // lut_sequential_algorithm->broadcast_lut(active_streams);
+    auto active_streams = streams.active_gpu_subset(num_seq_luts);
+    lut_sequential_algorithm->broadcast_lut(active_streams);
    free(h_seq_lut_indexes);
-  }
-
+  };
  void release(CudaStreams streams) {
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   group_resolved_carries,
@@ -1506,6 +1459,10 @@ template <typename Torus> struct int_hs_group_prop_memory {
                           uint32_t num_groups, uint32_t big_lwe_size_bytes,
                           bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
+    auto glwe_dimension = params.glwe_dimension;
+    auto polynomial_size = params.polynomial_size;
+    auto message_modulus = params.message_modulus;
+    auto carry_modulus = params.carry_modulus;

    auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus {
      if (msb == 2) {
@@ -1525,11 +1482,15 @@ template <typename Torus> struct int_hs_group_prop_memory {
    lut_hillis_steele = new int_radix_lut<Torus>(
        streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);

-    auto active_streams =
-        streams.active_gpu_subset(num_groups, params.pbs_type);
-    lut_hillis_steele->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {f_lut_hillis_steele}, gpu_memory_allocated);
-  }
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
+        lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, f_lut_hillis_steele,
+        gpu_memory_allocated);
+    auto active_streams = streams.active_gpu_subset(num_groups);
+    lut_hillis_steele->broadcast_lut(active_streams);
+  };
  void release(CudaStreams streams) {

    lut_hillis_steele->release(streams);
@@ -1704,8 +1665,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
    // Do I need to do something else for the multi-gpu?
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
    luts_array_first_step->broadcast_lut(active_streams);
  };
  void release(CudaStreams streams) {
@@ -1819,6 +1779,112 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
      num_extra_luts = 1;
    }

+    uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
+    luts_array_second_step = new int_radix_lut<Torus>(
+        streams, params, num_luts_second_step, num_radix_blocks,
+        allocate_gpu_memory, size_tracker);
+
+    // luts for first group inner propagation
+    for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
+      auto f_first_grouping_inner_propagation =
+          [lut_id](Torus propa_cum_sum_block) -> Torus {
+        uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
+
+        if (carry != 0) {
+          return 2ull; // Generates Carry
+        } else {
+          return 0ull; // Does not generate carry
+        }
+      };
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          luts_array_second_step->get_lut(0, lut_id),
+          luts_array_second_step->get_degree(lut_id),
+          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus,
+          f_first_grouping_inner_propagation, gpu_memory_allocated);
+    }
+
+    auto f_first_grouping_outer_propagation =
+        [num_bits_in_block](Torus block) -> Torus {
+      return (block >> (num_bits_in_block - 1)) & 1;
+    };
+
+    int lut_id = grouping_size - 1;
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        luts_array_second_step->get_lut(0, lut_id),
+        luts_array_second_step->get_degree(lut_id),
+        luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        f_first_grouping_outer_propagation, gpu_memory_allocated);
+
+    // for other groupings inner propagation
+    for (int index = 0; index < grouping_size; index++) {
+      uint32_t lut_id = index + grouping_size;
+
+      auto f_other_groupings_inner_propagation =
+          [index](Torus propa_cum_sum_block) -> Torus {
+        uint64_t mask = (2 << index) - 1;
+        if (propa_cum_sum_block >= (2 << index)) {
+          return 2ull; // Generates
+        } else if ((propa_cum_sum_block & mask) == mask) {
+          return 1ull; // Propagate
+        } else {
+          return 0ull; // Nothing
+        }
+      };
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          luts_array_second_step->get_lut(0, lut_id),
+          luts_array_second_step->get_degree(lut_id),
+          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus,
+          f_other_groupings_inner_propagation, gpu_memory_allocated);
+    }
+
+    if (use_sequential_algorithm_to_resolve_group_carries) {
+      for (int index = 0; index < grouping_size - 1; index++) {
+        uint32_t lut_id = index + 2 * grouping_size;
+
+        auto f_group_propagation = [index, block_modulus,
+                                    num_bits_in_block](Torus block) -> Torus {
+          if (block == (block_modulus - 1)) {
+            return 0ull;
+          } else {
+            return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
+          }
+        };
+
+        generate_device_accumulator<Torus>(
+            streams.stream(0), streams.gpu_index(0),
+            luts_array_second_step->get_lut(0, lut_id),
+            luts_array_second_step->get_degree(lut_id),
+            luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+            polynomial_size, message_modulus, carry_modulus,
+            f_group_propagation, gpu_memory_allocated);
+      }
+    } else {
+      uint32_t lut_id = 2 * grouping_size;
+      auto f_group_propagation = [block_modulus](Torus block) {
+        if (block == (block_modulus - 1)) {
+          return 2ull;
+        } else {
+          return UINT64_MAX % (block_modulus * 2ull);
+        }
+      };
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          luts_array_second_step->get_lut(0, lut_id),
+          luts_array_second_step->get_degree(lut_id),
+          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_group_propagation,
+          gpu_memory_allocated);
+    }
+
    Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);

    for (int index = 0; index < num_radix_blocks; index++) {
@@ -1854,11 +1920,6 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
      }
    }

-    uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
-    luts_array_second_step = new int_radix_lut<Torus>(
-        streams, params, num_luts_second_step, num_radix_blocks,
-        allocate_gpu_memory, size_tracker);
-
    // copy the indexes to the gpu
    Torus *second_lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1869,92 +1930,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
        scalar_array_cum_sum, h_scalar_array_cum_sum,
        num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-
-    std::vector<std::function<Torus(Torus)>> lut_funcs;
-    std::vector<uint32_t> lut_ids;
-
-    // luts for first group inner propagation
-    for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
-      auto f_first_grouping_inner_propagation =
-          [lut_id](Torus propa_cum_sum_block) -> Torus {
-        uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
-
-        if (carry != 0) {
-          return 2ull; // Generates Carry
-        } else {
-          return 0ull; // Does not generate carry
-        }
-      };
-      lut_funcs.push_back(f_first_grouping_inner_propagation);
-      lut_ids.push_back(lut_id);
-    }
-
-    auto f_first_grouping_outer_propagation =
-        [num_bits_in_block](Torus block) -> Torus {
-      return (block >> (num_bits_in_block - 1)) & 1;
-    };
-
-    int lut_id = grouping_size - 1;
-
-    lut_funcs.push_back(f_first_grouping_outer_propagation);
-    lut_ids.push_back(lut_id);
-
-    // for other groupings inner propagation
-    for (int index = 0; index < grouping_size; index++) {
-      uint32_t lut_id = index + grouping_size;
-
-      auto f_other_groupings_inner_propagation =
-          [index](Torus propa_cum_sum_block) -> Torus {
-        uint64_t mask = (2 << index) - 1;
-        if (propa_cum_sum_block >= (2 << index)) {
-          return 2ull; // Generates
-        } else if ((propa_cum_sum_block & mask) == mask) {
-          return 1ull; // Propagate
-        } else {
-          return 0ull; // Nothing
-        }
-      };
-
-      lut_funcs.push_back(f_other_groupings_inner_propagation);
-      lut_ids.push_back(lut_id);
-    }
-
-    if (use_sequential_algorithm_to_resolve_group_carries) {
-      for (int index = 0; index < grouping_size - 1; index++) {
-        uint32_t lut_id = index + 2 * grouping_size;
-
-        auto f_group_propagation = [index, block_modulus,
-                                    num_bits_in_block](Torus block) -> Torus {
-          if (block == (block_modulus - 1)) {
-            return 0ull;
-          } else {
-            return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
-          }
-        };
-
-        lut_funcs.push_back(f_group_propagation);
-        lut_ids.push_back(lut_id);
-      }
-    } else {
-      uint32_t lut_id = 2 * grouping_size;
-      auto f_group_propagation = [block_modulus](Torus block) {
-        if (block == (block_modulus - 1)) {
-          return 2ull;
-        } else {
-          return UINT64_MAX % (block_modulus * 2ull);
-        }
-      };
-
-      lut_funcs.push_back(f_group_propagation);
-      lut_ids.push_back(lut_id);
-    }
-
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_array_second_step->generate_and_broadcast_lut(
-        active_streams, lut_ids, lut_funcs, gpu_memory_allocated);
-
-    // luts_array_second_step->broadcast_lut(active_streams);
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    luts_array_second_step->broadcast_lut(active_streams);

    if (use_sequential_algorithm_to_resolve_group_carries) {

@@ -1978,8 +1955,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
    cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
        lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
-    auto new_active_streams = streams.active_gpu_subset(
-        new_num_blocks, luts_array_second_step->params.pbs_type);
+    auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
    // We just need to update the lut indexes so we use false here
    luts_array_second_step->broadcast_lut(new_active_streams, false);

@@ -2042,28 +2018,12 @@ template <typename Torus> struct int_sc_prop_memory {
  uint32_t requested_flag;
  bool gpu_memory_allocated;

-  void setup_message_extract_indices_for_carry_async(CudaStreams streams,
-                                                     uint32_t num_radix_blocks,
-                                                     bool allocate_gpu_memory) {
-    Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
-    for (int index = 0; index < num_radix_blocks + 1; index++) {
-      if (index < num_radix_blocks) {
-        h_lut_indexes[index] = 0;
-      } else {
-        h_lut_indexes[index] = 1;
-      }
-    }
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
-        (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
-        streams.gpu_index(0), allocate_gpu_memory);
-  }
-
  int_sc_prop_memory(CudaStreams streams, int_radix_params params,
                     uint32_t num_radix_blocks, uint32_t requested_flag_in,
                     bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->params = params;
+    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;
@@ -2086,6 +2046,24 @@ template <typename Torus> struct int_sc_prop_memory {
        streams, params, num_radix_blocks, grouping_size, num_groups,
        allocate_gpu_memory, size_tracker);

+    //  Step 3 elements
+    int num_luts_message_extract =
+        requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
+    lut_message_extract = new int_radix_lut<Torus>(
+        streams, params, num_luts_message_extract, num_radix_blocks + 1,
+        allocate_gpu_memory, size_tracker);
+    // lut for the first block in the first grouping
+    auto f_message_extract = [message_modulus](Torus block) -> Torus {
+      return (block >> 1) % message_modulus;
+    };
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
+        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, f_message_extract,
+        gpu_memory_allocated);
+
    // This store a single block that with be used to store the overflow or
    // carry results
    output_flag = new CudaRadixCiphertextFFI;
@@ -2136,30 +2114,22 @@ template <typename Torus> struct int_sc_prop_memory {
        return output1 << 3 | output2 << 2;
      };

-      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      lut_overflow_flag_prep->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {f_overflow_fp}, gpu_memory_allocated);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          lut_overflow_flag_prep->get_lut(0, 0),
+          lut_overflow_flag_prep->get_degree(0),
+          lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
+          gpu_memory_allocated);
+
+      auto active_streams = streams.active_gpu_subset(1);
+      lut_overflow_flag_prep->broadcast_lut(active_streams);
    }

-    //  Step 3 elements
-    int num_luts_message_extract =
-        requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
-    lut_message_extract = new int_radix_lut<Torus>(
-        streams, params, num_luts_message_extract, num_radix_blocks + 1,
-        allocate_gpu_memory, size_tracker);
-    // lut for the first block in the first grouping
-    auto f_message_extract = [message_modulus](Torus block) -> Torus {
-      return (block >> 1) % message_modulus;
-    };
-
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
-
    // For the final cleanup in case of overflow or carry (it seems that I can)
    // It seems that this lut could be apply together with the other one but for
    // now we won't do it
-    switch (requested_flag) {
-    case outputFlag::FLAG_OVERFLOW: { // Overflow case
+    if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
      auto f_overflow_last = [num_radix_blocks,
                              requested_flag_in](Torus block) -> Torus {
        uint32_t position = (num_radix_blocks == 1 &&
@@ -2171,38 +2141,61 @@ template <typename Torus> struct int_sc_prop_memory {
        Torus does_overflow_if_carry_is_0 = (block >> 2) & 1;
        if (input_carry == outputFlag::FLAG_OVERFLOW) {
          return does_overflow_if_carry_is_1;
+        } else {
+          return does_overflow_if_carry_is_0;
        }
-        return does_overflow_if_carry_is_0;
      };
-      setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
-                                                    allocate_gpu_memory);

-      lut_message_extract->generate_and_broadcast_lut(
-          active_streams, {0, 1}, {f_message_extract, f_overflow_last},
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          lut_message_extract->get_lut(0, 1),
+          lut_message_extract->get_degree(1),
+          lut_message_extract->get_max_degree(1), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_overflow_last,
          gpu_memory_allocated);
-      break;
-    }
-    case outputFlag::FLAG_CARRY: { // Carry case

-      setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
-                                                    allocate_gpu_memory);
+      Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
+      for (int index = 0; index < num_radix_blocks + 1; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
+          streams.gpu_index(0), allocate_gpu_memory);
+    }
+    if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case

      auto f_carry_last = [](Torus block) -> Torus {
        return ((block >> 2) & 1);
      };

-      lut_message_extract->generate_and_broadcast_lut(
-          active_streams, {0, 1}, {f_message_extract, f_carry_last},
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          lut_message_extract->get_lut(0, 1),
+          lut_message_extract->get_degree(1),
+          lut_message_extract->get_max_degree(1), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_carry_last,
          gpu_memory_allocated);
-      break;
-    }
-    default:
-      lut_message_extract->generate_and_broadcast_lut(
-          active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
-      break;
-    }

-    // lut_message_extract->broadcast_lut(active_streams);
+      Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
+      for (int index = 0; index < num_radix_blocks + 1; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
+          streams.gpu_index(0), allocate_gpu_memory);
+    }
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks + 1);
+    lut_message_extract->broadcast_lut(active_streams);
  };

  void release(CudaStreams streams) {
@@ -2398,8 +2391,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
    // Do I need to do something else for the multi-gpu?
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
    luts_array_first_step->broadcast_lut(active_streams);
  };

@@ -2410,8 +2402,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
    cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
        lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
-    auto new_active_streams = streams.active_gpu_subset(
-        new_num_blocks, luts_array_first_step->params.pbs_type);
+    auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
    // We just need to update the lut indexes so we use false here
    luts_array_first_step->broadcast_lut(new_active_streams, false);
  }
@@ -2500,11 +2491,15 @@ template <typename Torus> struct int_borrow_prop_memory {
      return (block >> 1) % message_modulus;
    };

-    active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
+        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, f_message_extract,
+        gpu_memory_allocated);
+    active_streams = streams.active_gpu_subset(num_radix_blocks);

-    lut_message_extract->generate_and_broadcast_lut(
-        active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
+    lut_message_extract->broadcast_lut(active_streams);

    if (compute_overflow) {
      lut_borrow_flag =
@@ -2515,12 +2510,15 @@ template <typename Torus> struct int_borrow_prop_memory {
        return ((block >> 2) & 1);
      };

-      lut_borrow_flag->generate_and_broadcast_lut(
-          active_streams, {0}, {f_borrow_flag}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          lut_borrow_flag->get_lut(0, 0), lut_borrow_flag->get_degree(0),
+          lut_borrow_flag->get_max_degree(0), glwe_dimension, polynomial_size,
+          message_modulus, carry_modulus, f_borrow_flag, gpu_memory_allocated);
+      lut_borrow_flag->broadcast_lut(active_streams);
    }

-    active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    active_streams = streams.active_gpu_subset(num_radix_blocks);
    internal_streams.create_internal_cuda_streams_on_same_gpus(active_streams,
                                                               2);
  };
--- a/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
@@ -37,14 +37,16 @@ template <typename Torus> struct int_mul_memory {
      zero_out_predicate_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          zero_out_predicate_lut->get_lut(0, 0),
+          zero_out_predicate_lut->get_degree(0),
+          zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          zero_out_predicate_lut_f, gpu_memory_allocated);

-      auto active_streams =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {zero_out_predicate_lut_f},
-          gpu_memory_allocated);
-
-      // zero_out_predicate_lut->broadcast_lut(active_streams);
+      auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+      zero_out_predicate_lut->broadcast_lut(active_streams);

      zero_out_mem = new int_zero_out_if_buffer<Torus>(
          streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -52,7 +54,10 @@ template <typename Torus> struct int_mul_memory {
      return;
    }

+    auto glwe_dimension = params.glwe_dimension;
+    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
+    auto carry_modulus = params.carry_modulus;

    // 'vector_result_lsb' contains blocks from all possible shifts of
    // radix_lwe_left excluding zero ciphertext blocks
@@ -96,6 +101,18 @@ template <typename Torus> struct int_mul_memory {
      return (x * y) / message_modulus;
    };

+    // generate accumulators
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), lsb_acc,
+        luts_array->get_degree(0), luts_array->get_max_degree(0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        lut_f_lsb, gpu_memory_allocated);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), msb_acc,
+        luts_array->get_degree(1), luts_array->get_max_degree(1),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        lut_f_msb, gpu_memory_allocated);
+
    // lut_indexes_vec for luts_array should be reinitialized
    // first lsb_vector_block_count value should reference to lsb_acc
    // last msb_vector_block_count values should reference to msb_acc
@@ -105,12 +122,8 @@ template <typename Torus> struct int_mul_memory {
          streams.stream(0), streams.gpu_index(0),
          luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
          msb_vector_block_count);
-
-    auto active_streams =
-        streams.active_gpu_subset(total_block_count, params.pbs_type);
-    luts_array->generate_and_broadcast_bivariate_lut(
-        active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, gpu_memory_allocated);
-
+    auto active_streams = streams.active_gpu_subset(total_block_count);
+    luts_array->broadcast_lut(active_streams);
    // create memory object for sum ciphertexts
    sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
        streams, params, num_radix_blocks, 2 * num_radix_blocks,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
@@ -126,11 +126,9 @@ template <typename Torus> struct int_grouped_oprf_memory {
        luts->get_lut_indexes(0, 0), this->h_lut_indexes,
        num_blocks_to_process * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams =
-        streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
    luts->broadcast_lut(active_streams);

-    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_corrections);
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
@@ -6,8 +6,6 @@ void release_radix_ciphertext_async(cudaStream_t const stream,
                                    CudaRadixCiphertextFFI *data,
                                    const bool gpu_memory_allocated);

-void release_cpu_radix_ciphertext_async(CudaRadixCiphertextFFI *data);
-
 void reset_radix_ciphertext_blocks(CudaRadixCiphertextFFI *data,
                                   uint32_t new_num_blocks);

--- a/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
@@ -85,11 +85,14 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-
-      auto active_streams =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
+          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          shift_lut_f, gpu_memory_allocated);
+      auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+      cur_lut_bivariate->broadcast_lut(active_streams);

      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
@@ -168,10 +171,15 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-      auto active_streams =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
+          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          shift_lut_f, gpu_memory_allocated);
+      auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+      cur_lut_bivariate->broadcast_lut(active_streams);
+
      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
  }
@@ -212,7 +220,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
                                     uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;

-    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
+    auto active_streams = streams.active_gpu_subset(1);
    // In the arithmetic shift, a PBS has to be applied to the last rotated
    // block twice: once to shift it, once to compute the padding block to be
    // copied onto all blocks to the left of the last rotated block
@@ -261,11 +269,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return shifted | padding;
      };

-      auto active_streams_shift_last =
-          streams.active_gpu_subset(1, params.pbs_type);
-      shift_last_block_lut_univariate->generate_and_broadcast_lut(
-          active_streams_shift_last, {0}, {last_block_lut_f},
-          gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          shift_last_block_lut_univariate->get_lut(0, 0),
+          shift_last_block_lut_univariate->get_degree(0),
+          shift_last_block_lut_univariate->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
+      auto active_streams_shift_last = streams.active_gpu_subset(1);
+      shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);

      lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
    }
@@ -283,8 +295,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
      return (params.message_modulus - 1) * x_sign_bit;
    };

-    padding_block_lut_univariate->generate_and_broadcast_lut(
-        active_streams, {0}, {padding_block_lut_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        padding_block_lut_univariate->get_lut(0, 0),
+        padding_block_lut_univariate->get_degree(0),
+        padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        padding_block_lut_f, gpu_memory_allocated);
+    // auto active_streams = streams.active_gpu_subset(1);
+    padding_block_lut_univariate->broadcast_lut(active_streams);

    lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -317,11 +336,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return message_of_current_block + carry_of_previous_block;
      };

+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          shift_blocks_lut_bivariate->get_lut(0, 0),
+          shift_blocks_lut_bivariate->get_degree(0),
+          shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          blocks_lut_f, gpu_memory_allocated);
      auto active_streams_shift_blocks =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams_shift_blocks, {0}, {blocks_lut_f},
-          gpu_memory_allocated);
+          streams.active_gpu_subset(num_radix_blocks);
+      shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);

      lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
    }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
@@ -113,21 +113,27 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
      else
        return current_bit;
    };
-    ;
-    auto active_gpu_count_mux = streams.active_gpu_subset(
-        bits_per_block * num_radix_blocks, params.pbs_type);

-    mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
-                                        gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
+        mux_lut->get_degree(0), mux_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, mux_lut_f, gpu_memory_allocated);
+    auto active_gpu_count_mux =
+        streams.active_gpu_subset(bits_per_block * num_radix_blocks);
+    mux_lut->broadcast_lut(active_gpu_count_mux);

    auto cleaning_lut_f = [params](Torus x) -> Torus {
      return x % params.message_modulus;
    };
-
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
+        cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
    auto active_gpu_count_cleaning =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    cleaning_lut->generate_and_broadcast_lut(
-        active_gpu_count_cleaning, {0}, {cleaning_lut_f}, gpu_memory_allocated);
+        streams.active_gpu_subset(num_radix_blocks);
+    cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
@@ -74,26 +74,44 @@ template <typename Torus> struct int_overflowing_sub_memory {
                                           luts_array, size_tracker,
                                           allocate_gpu_memory, size_tracker);

+    auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
+    auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
+
+    // generate luts (aka accumulators)
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
+        luts_array->get_degree(0), luts_array->get_max_degree(0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        f_lut_does_block_generate_carry, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        lut_does_block_generate_or_propagate, luts_array->get_degree(1),
+        luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
+        gpu_memory_allocated);
    if (allocate_gpu_memory)
      cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                  luts_array->get_lut_indexes(0, 1), 1,
                                  num_radix_blocks - 1);

-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {f_luts_borrow_propagation_sum},
-        gpu_memory_allocated);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        luts_borrow_propagation_sum->get_lut(0, 0),
+        luts_borrow_propagation_sum->get_degree(0),
+        luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        f_luts_borrow_propagation_sum, gpu_memory_allocated);

-    luts_array->generate_and_broadcast_lut(
-        active_streams, {0, 1},
-        {f_lut_does_block_generate_carry,
-         f_lut_does_block_generate_or_propagate},
-        gpu_memory_allocated);
-    // generate luts (aka accumulators)
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
+        message_acc->get_degree(0), message_acc->get_max_degree(0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        f_message_acc, gpu_memory_allocated);

-    message_acc->generate_and_broadcast_lut(
-        active_streams, {0}, {f_message_acc}, gpu_memory_allocated);
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    luts_array->broadcast_lut(active_streams);
+    luts_borrow_propagation_sum->broadcast_lut(active_streams);
+    message_acc->broadcast_lut(active_streams);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_comparison.h
@@ -38,8 +38,7 @@ template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
      num_streams_to_use = 1;

    this->num_streams = num_streams_to_use;
-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    uint32_t num_gpus = active_streams.count();

--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
@@ -40,8 +40,7 @@ template <typename Torus> struct int_equality_selectors_buffer {

    this->num_streams = num_streams_to_use;

-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);
@@ -155,8 +154,7 @@ template <typename Torus> struct int_possible_results_buffer {

    this->num_streams = num_streams_to_use;

-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);
@@ -209,8 +207,7 @@ template <typename Torus> struct int_possible_results_buffer {
            params.message_modulus, params.carry_modulus, fns,
            allocate_gpu_memory);

-        current_lut->broadcast_lut(
-            streams.active_gpu_subset(1, params.pbs_type));
+        current_lut->broadcast_lut(streams.active_gpu_subset(1));
        stream_luts[lut_count++] = current_lut;
        lut_value_start += luts_in_this_call;
      }
@@ -285,8 +282,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {

    this->num_streams = num_streams_to_use;

-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams);
@@ -298,10 +294,13 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
      int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

-      lut->generate_and_broadcast_lut(
-          streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
-          allocate_gpu_memory);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          id_fn, allocate_gpu_memory);

+      lut->broadcast_lut(streams.active_gpu_subset(num_blocks));
      this->stream_identity_luts[i] = lut;
    }

@@ -314,17 +313,27 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {

    this->message_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-
-    this->message_extract_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
-        allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->message_extract_lut->get_lut(0, 0),
+        this->message_extract_lut->get_degree(0),
+        this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        msg_fn, allocate_gpu_memory);
+    this->message_extract_lut->broadcast_lut(
+        streams.active_gpu_subset(num_blocks));

    this->carry_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-
-    this->carry_extract_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
-        allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->carry_extract_lut->get_lut(0, 0),
+        this->carry_extract_lut->get_degree(0),
+        this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        carry_fn, allocate_gpu_memory);
+    this->carry_extract_lut->broadcast_lut(
+        streams.active_gpu_subset(num_blocks));

    this->partial_aggregated_vectors =
        new CudaRadixCiphertextFFI *[num_streams];
@@ -619,8 +628,7 @@ template <typename Torus> struct int_unchecked_contains_buffer {
      num_streams_to_use = 1;

    this->num_streams = num_streams_to_use;
-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);
@@ -695,8 +703,7 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
      num_streams_to_use = 1;

    this->num_streams = num_streams_to_use;
-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);
@@ -1087,8 +1094,7 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
      num_streams_to_use = 1;

    this->num_streams = num_streams_to_use;
-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);
@@ -1171,9 +1177,14 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {prefix_sum_fn}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->prefix_sum_lut->get_lut(0, 0),
+        this->prefix_sum_lut->get_degree(0),
+        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        prefix_sum_fn, allocate_gpu_memory);
+    this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1183,9 +1194,13 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    this->cleanup_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {cleanup_fn}, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
+        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        cleanup_fn, allocate_gpu_memory);
+    this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
  }

  void release(CudaStreams streams) {
@@ -1277,8 +1292,7 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
      num_streams_to_use = 1;

    this->num_streams = num_streams_to_use;
-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);
@@ -1351,9 +1365,14 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {prefix_sum_fn}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->prefix_sum_lut->get_lut(0, 0),
+        this->prefix_sum_lut->get_degree(0),
+        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        prefix_sum_fn, allocate_gpu_memory);
+    this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1363,9 +1382,13 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    this->cleanup_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {cleanup_fn}, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
+        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        cleanup_fn, allocate_gpu_memory);
+    this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
  }

  void release(CudaStreams streams) {
@@ -1439,8 +1462,7 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
      num_streams_to_use = 1;

    this->num_streams = num_streams_to_use;
-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);
@@ -1501,8 +1523,7 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
      num_streams_to_use = 1;

    this->num_streams = num_streams_to_use;
-    this->active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
+    this->active_streams = streams.active_gpu_subset(num_blocks);

    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
@@ -5,14 +5,21 @@

 extern "C" {

-void cuda_keyswitch_lwe_ciphertext_vector_64_64(
+void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

-void cuda_keyswitch_lwe_ciphertext_vector_64_32(
+void cuda_keyswitch_gemm_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
+    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, const void *ks_tmp_buffer, bool uses_trivial_indexes);
+
+void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
@@ -24,20 +31,6 @@ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_lwes, bool allocate_gpu_memory);

-void cuda_keyswitch_gemm_lwe_ciphertext_vector_64_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
-    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, const void *ks_tmp_buffer, bool uses_trivial_indexes);
-
-void cuda_keyswitch_gemm_lwe_ciphertext_vector_64_32(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
-    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, const void *ks_tmp_buffer, bool uses_trivial_indexes);
-
 uint64_t scratch_cuda_keyswitch_gemm_64(void *stream, uint32_t gpu_index,
                                        void **ks_tmp_memory,
                                        uint32_t lwe_dimension_in,
@@ -72,10 +65,6 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
                                                uint32_t gpu_index,
                                                int8_t **fp_ks_buffer,
                                                bool gpu_memory_allocated);
-
-void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
-                                   void const *input, void *output,
-                                   uint32_t base_log, uint32_t level_count);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
@@ -1,24 +0,0 @@
-#ifndef TRIVIUM_H
-#define TRIVIUM_H
-
-#include "../integer/integer.h"
-
-extern "C" {
-uint64_t scratch_cuda_trivium_64(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
-
-void cuda_trivium_generate_keystream_64(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
-    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
-    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
-
-void cleanup_cuda_trivium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-}
-
-#endif
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Enzo Di Maria	f3f16da856	chore(gpu): some tests on erc20	2025-12-11 14:09:32 +01:00
Enzo Di Maria	368b3c1b87	refactor(gpu): creating benchmarks for match_value	2025-12-11 10:31:44 +01:00