fix(gpu): handling temporary events destruction

2026-04-28 03:01:21 -04:00 · 2026-01-26 16:57:49 +01:00
195 changed files with 3666 additions and 11628 deletions
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -41,7 +41,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -66,7 +66,7 @@ jobs:
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'true' # Needed to pull lfs data
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -141,7 +141,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -63,7 +63,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -171,7 +171,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -299,7 +299,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -50,7 +50,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -86,7 +86,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    timeout-minutes: 480 # 8 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
    timeout-minutes: 1440
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -100,7 +100,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -51,7 +51,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -172,7 +172,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -72,7 +72,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -155,7 +155,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -279,7 +279,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -39,7 +39,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -147,7 +147,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -16,8 +16,7 @@ on:
          - integer_zk
          - shortint
          - shortint_oprf
-          - hlapi_unsigned
-          - hlapi_signed
+          - hlapi
          - hlapi_erc20
          - hlapi_dex
          - hlapi_noise_squash
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
        params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -229,7 +229,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -105,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -25,6 +25,10 @@ on:
        description: "Generate SVG tables"
        type: boolean
        default: true
+      open-pr:
+        description: "Open a PR with the benchmark results"
+        type: boolean
+        default: false

 permissions: {}

@@ -162,3 +166,54 @@ jobs:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
+
+  open-pr:
+    name: benchmark-documentation/open-pr
+    needs: [ generate-svgs-with-benchmarks-run, generate-svgs-without-benchmarks-run ]
+    if: ${{ always() && inputs.open-pr &&
+      (needs.generate-svgs-with-benchmarks-run.result == 'success' || needs.generate-svgs-without-benchmarks-run.result == 'success') }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write # Needed to create a commit
+      pull-requests: write # Needed to open a pull-request
+    env:
+      PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        with:
+          persist-credentials: 'false'
+
+      - name: Download SVG tables
+        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+        with:
+          path: svg_tables
+          merge-multiple: 'true'
+
+      # Perform best effort to copy SVG tables. If the copy fails or files don't exist, the PR will still be created.
+      - name: Copy SVG tables to documentation location
+        run: |
+          cp -f svg_tables/*integer-benchmark*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
+          cp -f svg_tables/*pbs-benchmark-tuniform*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
+          cp -f svg_tables/cpu-gpu-hpu-integer-benchmark-fheuint64-tuniform-2m128-ciphertext.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
+
+      - name: Get current date
+        id: get-date
+        run: |
+          echo "date=$(date '+%g_%m_%d_%Hh%Mm%Ss')" >> "${GITHUB_OUTPUT}"
+
+      - name: Create pull-request
+        uses: peter-evans/create-pull-request@98357b18bf14b5342f975ff684046ec3b2a07725 # v8.0.0
+        with:
+          sign-commits: true # Commit will be signed by github-actions bot
+          add-paths: ${{ env.PATH_TO_DOC_ASSETS }}/*.svg
+          branch: gh-bot/docs/update-svg-tables-${{ steps.get-date.outputs.date }}
+          commit-message: |
+            chore(docs): update benchmark results for all backends
+
+            Automated documentation update from tfhe-rs CI pipeline.
+          title: |
+            [CI] chore(docs): update benchmark results for all backends
+          body: |
+            Documentation update triggered by GitHub workflow.
+          labels: documentation
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -40,7 +40,7 @@ jobs:
    timeout-minutes: 1440 # 24 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
@@ -123,7 +123,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -134,7 +134,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -175,7 +175,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -287,7 +287,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -92,7 +92,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
          git lfs install

      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
        with:
          path: tfhe-rs
          persist-credentials: false
@@ -141,7 +141,7 @@ jobs:
          ls

      - name: Checkout fhevm
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
        with:
          repository: zama-ai/fhevm
          persist-credentials: 'false'
@@ -192,7 +192,7 @@ jobs:
          cargo install sqlx-cli

      - name: Install foundry
-        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10
+        uses: foundry-rs/foundry-toolchain@8b0419c685ef46cb79ec93fbdc131174afceb730

      - name: Cache cargo
        uses: actions/cache@8b402f58fbc84540c8b491a91e594a4576fec3d7 # v5.0.2
@@ -299,7 +299,7 @@ jobs:
          path: fhevm/$${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -12,8 +12,7 @@ on:
        default: integer
        options:
          - integer
-          - hlapi_unsigned
-          - hlapi_signed
+          - hlapi
          - hlapi_erc20
      op_flavor:
        description: "Operations set to run"
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -126,7 +126,7 @@ jobs:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -191,7 +191,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -50,7 +50,7 @@ jobs:
      pull-requests: write # Needed to write a comment in a pull-request
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
        command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0  # Needed to get commit hash
          persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
@@ -305,13 +305,13 @@ jobs:
      REF_NAME: ${{ github.head_ref || github.ref_name }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Install recent Python
-        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
        with:
          python-version: '3.12'
          pip-install: -r ci/data_extractor/requirements.txt -r ci/perf_regression/requirements.txt
@@ -383,7 +383,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +55,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -102,7 +102,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
@@ -134,7 +134,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +55,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -102,7 +102,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
@@ -134,7 +134,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -39,7 +39,7 @@ jobs:
      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -71,7 +71,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,7 +91,7 @@ jobs:
        browser: [ chrome, firefox ]
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -181,7 +181,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: zama-ai/slab
          path: slab
@@ -213,7 +213,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_audit.yml
+++ b/.github/workflows/cargo_audit.yml
@@ -26,7 +26,7 @@ jobs:
    name: cargo_audit/audit
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -24,7 +24,7 @@ jobs:
    outputs:
      matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -140,7 +140,7 @@ jobs:
      result: ${{ steps.set_builds_result.outputs.result }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -242,7 +242,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_tfhe_fft.yml
+++ b/.github/workflows/cargo_build_tfhe_fft.yml
@@ -26,7 +26,7 @@ jobs:
      fail-fast: false

    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -24,7 +24,7 @@ jobs:
        os: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -29,7 +29,7 @@ jobs:
      fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -56,7 +56,7 @@ jobs:
        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
      fail-fast: false
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -92,7 +92,7 @@ jobs:
    if: needs.should-run.outputs.fft_test == 'true'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -31,7 +31,7 @@ jobs:
      ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -60,7 +60,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,7 +87,7 @@ jobs:
        os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
      fail-fast: false
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -20,7 +20,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -43,7 +43,7 @@ jobs:
          echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"

      - name: Check workflows security
-        uses: zizmorcore/zizmor-action@135698455da5c3b3e55f73f4419e481ab68cdd95 # v0.4.1
+        uses: zizmorcore/zizmor-action@e639db99335bc9038abc0e066dfcd72e23d26fb4 # v0.3.0
        with:
          advanced-security: 'false' # Print results directly in logs
          persona: pedantic
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -31,7 +31,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
    timeout-minutes: 5760 # 4 days
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -37,7 +37,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/generate_svg_common.yml
+++ b/.github/workflows/generate_svg_common.yml
@@ -43,7 +43,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'

--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -41,7 +41,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -43,7 +43,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,7 +79,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -198,7 +198,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -39,7 +39,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -79,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -114,7 +114,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -151,7 +151,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu_fast
+          make test_high_level_api_gpu

  slack-notify:
    name: gpu_fast_tests/slack-notify
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -68,7 +68,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -154,7 +154,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu_fast
+          make test_high_level_api_gpu

  slack-notify:
    name: gpu_full_multi_gpu_tests/slack-notify
@@ -187,7 +187,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -38,7 +38,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -65,7 +65,7 @@ jobs:
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -74,7 +74,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -159,7 +159,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -32,7 +32,7 @@ jobs:
      hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -62,7 +62,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -83,7 +83,7 @@ jobs:
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -114,7 +114,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -34,7 +34,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -53,7 +53,7 @@ jobs:
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -83,7 +83,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -41,7 +41,7 @@ jobs:
    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -52,7 +52,7 @@ jobs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -93,7 +93,7 @@ jobs:
      id-token: write # Needed for OIDC token exchange on crates.io
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
        with:
          fetch-depth: 0
          persist-credentials: 'false'
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -221,7 +221,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -68,7 +68,7 @@ jobs:
      id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
    steps:
      - name: Checkout
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -92,7 +92,7 @@ jobs:

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
-        uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
+        uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
@@ -109,7 +109,7 @@ jobs:

      - name: Publish Node package
        if: ${{ inputs.push_node_package }}
-        uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
+        uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
          toolchain: stable

      - name: Checkout lattice-estimator
-        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/pr_milestone_check.yml
+++ b/.github/workflows/pr_milestone_check.yml
@@ -0,0 +1,67 @@
+name: pr_milestone_check
+
+on:
+  pull_request:
+    types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+# external contributors workflows are manually approved
+
+jobs:
+  check-empty-milestone:
+    name: pr_milestone_check/check-empty-milestone
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.milestone == null
+    permissions:
+      pull-requests: write # Need write access on pull requests to post comment
+
+    steps:
+      - name: Post Reminder Comment
+        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
+        with:
+          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
+          body: |
+            '### ❌ Milestone Missing
+
+            Please assign a milestone to this pull request. If your PR targets the next version of
+            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
+
+            If your PR targets a patch version for previous releases: consider creating a dedicated
+            milestone e.g. v1.5.1 if it does not exist yet.'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Check Final Status
+        run: |
+          echo "::error::Milestone is missing. This check is failing."
+          exit 1
+
+  check-milestone-open:
+    name: pr_milestone_check/check-milestone-open
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
+    permissions:
+      pull-requests: write # Need write access on pull requests to post comment
+
+    steps:
+      - name: Post Reminder Comment
+        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
+        with:
+          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
+          body: |
+            '### ❌ Milestone is closed
+
+            Please assign an open milestone to this pull request. If your PR targets the next version of
+            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
+
+            If your PR targets a patch version for previous releases: consider creating a dedicated
+            milestone e.g. v1.5.1 if it does not exist yet.'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Check Final Status
+        run: |
+          echo "::error::Milestone is closed. This check is failing."
+          exit 1
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -30,7 +30,7 @@ jobs:
        run: |
          echo ">>> Cloning source repo..."
          git lfs install
-          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
+          git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"

          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -47,8 +47,6 @@ jobs:

          echo ">>> Pushing all LFS items..."
          git lfs push --all destination "${DESTINATION_BRANCH}"
-          
-          shred --remove .git/config

      - name: git-sync-tags
        env:
@@ -61,7 +59,7 @@ jobs:
        run: |
          echo ">>> Cloning source repo..."
          git lfs install
-          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
+          git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"

          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -72,5 +70,3 @@ jobs:

          echo ">>> Pushing git changes..."
          git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
-          
-          shred --remove .git/config
--- a/48
+++ b/48
@@ -733,12 +733,11 @@ test_core_crypto_gpu:
 		--features=gpu -p tfhe -- core_crypto::gpu::

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
-test_integer_gpu: install_cargo_nextest
-	TEST_THREADS=2 \
-	DOCTEST_THREADS=4 \
-		./scripts/integer-tests.sh \
-		--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
-		--tfhe-package "tfhe" --all-but-noise
+test_integer_gpu:
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
+		--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=2
+	RUSTFLAGS="$(RUSTFLAGS)" cargo test --doc --profile $(CARGO_PROFILE) \
+		--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=4

 .PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
 test_integer_gpu_debug:
@@ -1050,16 +1049,10 @@ test_high_level_api:
 		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
 		-- high_level_api::

-test_high_level_api_gpu_fast: install_cargo_nextest # Run all the GPU tests for high_level_api except test_uniformity for oprf which is too long
+test_high_level_api_gpu: install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
 		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-	  -E "test(/high_level_api::.*gpu.*/) and not test(/uniformity/)"
-
-
-test_high_level_api_gpu: install_cargo_nextest # Run all the GPU tests for high_level_api
-	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
-		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-  	-E "test(/high_level_api::.*gpu.*/)"
+		-E "test(/high_level_api::.*gpu.*/)"

 test_list_gpu: install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest list --cargo-profile $(CARGO_PROFILE) \
@@ -1378,9 +1371,6 @@ clippy_bench: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
 		-p tfhe-benchmark -- --no-deps -D warnings
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-	  --features=shortint,internal-keycache \
-		-p tfhe-benchmark -- --no-deps -D warnings

 .PHONY: clippy_bench_gpu # Run clippy lints on tfhe-benchmark
 clippy_bench_gpu: install_rs_check_toolchain
@@ -1415,14 +1405,14 @@ bench_signed_integer: install_rs_check_toolchain

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_signed_integer_gpu # Run benchmarks for signed integer on GPU backend
 bench_signed_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1485,13 +1475,6 @@ bench_integer_trivium_gpu: install_rs_check_toolchain
 	--bench integer-trivium \
 	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_integer_kreyvium_gpu # Run benchmarks for kreyvium on GPU backend
-bench_integer_kreyvium_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-kreyvium \
-	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
-
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1672,18 +1655,11 @@ bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) bench_web_js_api_unsafe_coop_firefox

-.PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
-bench_hlapi_unsigned: install_rs_check_toolchain
+.PHONY: bench_hlapi # Run benchmarks for integer operations
+bench_hlapi: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi_unsigned \
-	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
-
-.PHONY: bench_hlapi_signed # Run benchmarks for signed integer operations
-bench_hlapi_signed: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi_signed \
+	--bench hlapi \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_hlapi_gpu # Run benchmarks for integer operations on GPU
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -87,7 +87,6 @@ fn main() {
            "cuda/include/integer/rerand.h",
            "cuda/include/aes/aes.h",
            "cuda/include/trivium/trivium.h",
-            "cuda/include/kreyvium/kreyvium.h",
            "cuda/include/zk/zk.h",
            "cuda/include/keyswitch/keyswitch.h",
            "cuda/include/keyswitch/ks_enums.h",
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -29,13 +29,15 @@ template <typename Torus> struct int_aes_lut_buffers {
        allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus, Torus)> and_lambda =
        [](Torus a, Torus b) -> Torus { return a & b; };
-
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
+        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, and_lambda, allocate_gpu_memory);
    auto active_streams_and_lut = streams.active_gpu_subset(
        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
        params.pbs_type);
-    this->and_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_and_lut, {0}, {and_lambda}, allocate_gpu_memory);
-
+    this->and_lut->broadcast_lut(active_streams_and_lut);
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->flush_lut = new int_radix_lut<Torus>(
@@ -44,11 +46,14 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
      return x & 1;
    };
-
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
+        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, flush_lambda, allocate_gpu_memory);
    auto active_streams_flush_lut = streams.active_gpu_subset(
        AES_STATE_BITS * num_aes_inputs, params.pbs_type);
-    this->flush_lut->generate_and_broadcast_lut(
-        active_streams_flush_lut, {0}, {flush_lambda}, allocate_gpu_memory);
+    this->flush_lut->broadcast_lut(active_streams_flush_lut);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->carry_lut = new int_radix_lut<Torus>(
@@ -56,11 +61,14 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
      return (x >> 1) & 1;
    };
-
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
+        this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, carry_lambda, allocate_gpu_memory);
    auto active_streams_carry_lut =
        streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
-    this->carry_lut->generate_and_broadcast_lut(
-        active_streams_carry_lut, {0}, {carry_lambda}, allocate_gpu_memory);
+    this->carry_lut->broadcast_lut(active_streams_carry_lut);
    this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -11,6 +11,10 @@ extern bool p2p_enabled;
 extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
 extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;

+extern "C" {
+int32_t cuda_setup_multi_gpu(int device_0_id);
+}
+
 // Define a variant type that can be either a vector or a single pointer
 template <typename Torus>
 using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
--- a/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
@@ -45,9 +45,12 @@ template <typename Torus> struct boolean_bitop_buffer {

        // BooleanBlock can have degree 0 or 1. when ct is 0 path is hardcoded,
        // only lut for degree = 1 is generated
-        lut->generate_and_broadcast_bivariate_lut(active_streams, {0},
-                                                  {lut_bivariate_f},
-                                                  gpu_memory_allocated, {}, 2);
+        generate_device_accumulator_bivariate_with_factor<Torus>(
+            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_bivariate_f, 2, gpu_memory_allocated);
+        lut->broadcast_lut(active_streams);
      }
      break;
    default:
@@ -62,8 +65,14 @@ template <typename Torus> struct boolean_bitop_buffer {
        return x % params.message_modulus;
      };

-      message_extract_lut->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          message_extract_lut->get_lut(0, 0),
+          message_extract_lut->get_degree(0),
+          message_extract_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_message_extract, gpu_memory_allocated);
+      message_extract_lut->broadcast_lut(active_streams);
    }
    tmp_lwe_left = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -133,8 +142,12 @@ template <typename Torus> struct int_bitop_buffer {
          }
        };

-        lut->generate_and_broadcast_bivariate_lut(
-            active_streams, {0}, {lut_bivariate_f}, gpu_memory_allocated);
+        generate_device_accumulator_bivariate<Torus>(
+            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
+        lut->broadcast_lut(active_streams);
      }
      break;
    default:
@@ -143,8 +156,6 @@ template <typename Torus> struct int_bitop_buffer {
                                     num_radix_blocks, allocate_gpu_memory,
                                     size_tracker);

-      std::vector<std::function<Torus(Torus)>> lut_funcs;
-      std::vector<uint32_t> lut_indices;
      for (int i = 0; i < params.message_modulus; i++) {
        auto rhs = i;

@@ -160,13 +171,14 @@ template <typename Torus> struct int_bitop_buffer {
            return x ^ rhs;
          }
        };
-
-        lut_funcs.push_back(lut_univariate_scalar_f);
-        lut_indices.push_back(i);
+        generate_device_accumulator<Torus>(
+            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
+            lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_univariate_scalar_f,
+            gpu_memory_allocated);
+        lut->broadcast_lut(active_streams);
      }
-
-      lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
-                                      gpu_memory_allocated);
    }
  }

@@ -199,11 +211,16 @@ template <typename Torus> struct boolean_bitnot_buffer {
        return x % message_modulus;
      };

+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          message_extract_lut->get_lut(0, 0),
+          message_extract_lut->get_degree(0),
+          message_extract_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_message_extract, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
-
-      message_extract_lut->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      message_extract_lut->broadcast_lut(active_streams);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
@@ -28,17 +28,21 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
      uint32_t bits_per_block = std::log2(params.message_modulus);
      uint32_t msg_modulus = params.message_modulus;

-      auto active_streams =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-
-      lut->generate_and_broadcast_lut(
-          active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          [msg_modulus, bits_per_block](Torus x) {
            const auto xm = x % msg_modulus;
            const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
            return (Torus)((msg_modulus - 1) * sign_bit);
-          }},
+          },
          allocate_gpu_memory);

+      auto active_streams =
+          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+      lut->broadcast_lut(active_streams);
+
      this->last_block = new CudaRadixCiphertextFFI;

      create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
@@ -85,6 +85,24 @@ template <typename Torus> struct int_cmux_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
+        predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
+        predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
+        message_extract_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        message_extract_lut_f, gpu_memory_allocated);
    Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
    for (int index = 0; index < 2 * num_radix_blocks; index++) {
      if (index < num_radix_blocks) {
@@ -97,18 +115,12 @@ template <typename Torus> struct int_cmux_buffer {
        predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
        2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-
    auto active_streams_pred =
        streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
-    predicate_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
-        gpu_memory_allocated);
-
+    predicate_lut->broadcast_lut(active_streams_pred);
    auto active_streams_msg =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-
-    message_extract_lut->generate_and_broadcast_lut(
-        active_streams_msg, {0}, {message_extract_lut_f}, gpu_memory_allocated);
+    message_extract_lut->broadcast_lut(active_streams_msg);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -39,21 +39,22 @@ template <typename Torus> struct int_are_all_block_true_buffer {
        max_chunks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

-    preallocated_h_lut = (Torus *)malloc(
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
-
    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
                                            allocate_gpu_memory, size_tracker);
-
-    auto active_streams =
-        streams.active_gpu_subset(max_chunks, params.pbs_type);
-
    auto is_max_value_f = [max_value](Torus x) -> Torus {
      return x == max_value;
    };
+    preallocated_h_lut = (Torus *)malloc(
+        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
+        is_max_value->get_degree(0), is_max_value->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, is_max_value_f, gpu_memory_allocated);

-    is_max_value->generate_and_broadcast_lut(
-        active_streams, {0}, {is_max_value_f}, gpu_memory_allocated);
+    auto active_streams =
+        streams.active_gpu_subset(max_chunks, params.pbs_type);
+    is_max_value->broadcast_lut(active_streams);
  }

  void release(CudaStreams streams) {
@@ -102,10 +103,15 @@ template <typename Torus> struct int_comparison_eq_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
+        is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    is_non_zero_lut->generate_and_broadcast_lut(
-        active_streams, {0}, {is_non_zero_lut_f}, gpu_memory_allocated);
+    is_non_zero_lut->broadcast_lut(active_streams);

    // Scalar may have up to num_radix_blocks blocks
    scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -123,27 +129,32 @@ template <typename Torus> struct int_comparison_eq_buffer {
        return (lhs == rhs);
      }
    };
-
-    std::vector<std::function<Torus(Torus)>> lut_funcs;
-    std::vector<uint32_t> lut_indices;
    for (int i = 0; i < total_modulus; i++) {
      auto lut_f = [i, operator_f](Torus x) -> Torus {
        return operator_f(i, x);
      };
-      lut_funcs.push_back(lut_f);
-      lut_indices.push_back(i);
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          scalar_comparison_luts->get_lut(0, i),
+          scalar_comparison_luts->get_degree(i),
+          scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f, gpu_memory_allocated);
    }
-
-    scalar_comparison_luts->generate_and_broadcast_lut(
-        active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
-
+    scalar_comparison_luts->broadcast_lut(active_streams);
    if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
      operator_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

-      operator_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {operator_f}, gpu_memory_allocated);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
+          operator_lut->get_degree(0), operator_lut->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, operator_f, gpu_memory_allocated);
+
+      operator_lut->broadcast_lut(active_streams);
    } else {
      operator_lut = nullptr;
    }
@@ -210,6 +221,9 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
        streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    // LUTs
+    tree_inner_leaf_lut =
+        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
+                                 allocate_gpu_memory, size_tracker);

    tree_last_leaf_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -220,14 +234,15 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
    tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-    tree_inner_leaf_lut =
-        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
-                                 allocate_gpu_memory, size_tracker);
-
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
+        tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        block_selector_f, gpu_memory_allocated);
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {block_selector_f}, allocate_gpu_memory);
+    tree_inner_leaf_lut->broadcast_lut(active_streams);
  }

  void release(CudaStreams streams) {
@@ -411,8 +426,12 @@ template <typename Torus> struct int_comparison_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    identity_lut->generate_and_broadcast_lut(
-        active_streams, {0}, {identity_lut_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
+        identity_lut->get_degree(0), identity_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, identity_lut_f, gpu_memory_allocated);
+    identity_lut->broadcast_lut(active_streams);

    uint32_t total_modulus = params.message_modulus * params.carry_modulus;
    auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -422,8 +441,13 @@ template <typename Torus> struct int_comparison_buffer {
    is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                           allocate_gpu_memory, size_tracker);

-    is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
-                                            gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
+        is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, is_zero_f, gpu_memory_allocated);
+
+    is_zero_lut->broadcast_lut(active_streams);

    switch (op) {
    case COMPARISON_TYPE::MAX:
@@ -498,9 +522,13 @@ template <typename Torus> struct int_comparison_buffer {
        PANIC("Cuda error: sign_lut creation failed due to wrong function.")
      };

+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
+          signed_lut->get_degree(0), signed_lut->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, signed_lut_f, gpu_memory_allocated);
      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      signed_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {signed_lut_f}, gpu_memory_allocated);
+      signed_lut->broadcast_lut(active_streams);
    }
    preallocated_h_lut = (Torus *)malloc(
        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -11,26 +11,16 @@ template <typename Torus> struct int_compression {
  Torus *tmp_glwe_array_out;
  bool gpu_memory_allocated;
  uint32_t lwe_per_glwe;
-  uint32_t max_num_glwes;

-  // num_radix_blocks: total number of LWE ciphertexts (radix blocks) to
-  // compress lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size),
-  // defined by the chosen parameter set
  int_compression(CudaStreams streams, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
-    this->lwe_per_glwe = lwe_per_glwe;

    uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                     compression_params.polynomial_size;

-    // Calculate the actual number of GLWEs needed based on total radix blocks.
-    // This ensures we allocate enough memory when num_radix_blocks >
-    // lwe_per_glwe.
-    max_num_glwes = (num_radix_blocks + lwe_per_glwe - 1) / lwe_per_glwe;
-
    tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
        num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
            sizeof(Torus),
@@ -38,7 +28,7 @@ template <typename Torus> struct int_compression {
        allocate_gpu_memory));
    tmp_glwe_array_out =
        static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
-            max_num_glwes * glwe_accumulator_size * sizeof(Torus),
+            lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory));

@@ -116,14 +106,19 @@ template <typename Torus> struct int_decompression {
          encryption_params.carry_modulus;
      auto effective_compression_carry_modulus = 1;

-      auto active_streams = streams.active_gpu_subset(
-          num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
-      decompression_rescale_lut->generate_and_broadcast_lut_with_encoding(
-          active_streams, {0}, {decompression_rescale_f},
+      generate_device_accumulator_with_encoding<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          decompression_rescale_lut->get_lut(0, 0),
+          decompression_rescale_lut->get_degree(0),
+          decompression_rescale_lut->get_max_degree(0),
+          encryption_params.glwe_dimension, encryption_params.polynomial_size,
          effective_compression_message_modulus,
          effective_compression_carry_modulus,
          encryption_params.message_modulus, encryption_params.carry_modulus,
-          gpu_memory_allocated);
+          decompression_rescale_f, gpu_memory_allocated);
+      auto active_streams = streams.active_gpu_subset(
+          num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
+      decompression_rescale_lut->broadcast_lut(active_streams);
    }
  }
  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -283,9 +283,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                                     zero_out_if_not_1_lut_2};
    size_t lut_gpu_indexes[2] = {0, 3};
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
-                                          {0}, {zero_out_if_not_1_lut_f},
-                                          gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(lut_gpu_indexes[j]),
+          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
    }

    luts[0] = zero_out_if_not_2_lut_1;
@@ -293,9 +296,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    lut_gpu_indexes[0] = 1;
    lut_gpu_indexes[1] = 2;
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
-                                          {0}, {zero_out_if_not_2_lut_f},
-                                          gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(lut_gpu_indexes[j]),
+          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
    }

    quotient_lut_1 =
@@ -315,12 +321,21 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    };
    auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };

-    quotient_lut_1->generate_and_broadcast_lut(
-        streams.get_ith(2), {0}, {quotient_lut_1_f}, gpu_memory_allocated);
-    quotient_lut_2->generate_and_broadcast_lut(
-        streams.get_ith(1), {0}, {quotient_lut_2_f}, gpu_memory_allocated);
-    quotient_lut_3->generate_and_broadcast_lut(
-        streams.get_ith(0), {0}, {quotient_lut_3_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
+        quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
+        quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
+        quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);

    message_extract_lut_1 = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -335,12 +350,15 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    luts[0] = message_extract_lut_1;
    luts[1] = message_extract_lut_2;

-    auto active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
-
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
+      auto active_streams =
+          streams.active_gpu_subset(num_blocks, params.pbs_type);
+      luts[j]->broadcast_lut(active_streams);
    }
  }

@@ -989,14 +1007,24 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      masking_luts_2[i] = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
+          masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_masking, gpu_memory_allocated);
      auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
-      masking_luts_1[i]->generate_and_broadcast_lut(
-          active_streams_1, {0}, {lut_f_masking}, gpu_memory_allocated);
+      masking_luts_1[i]->broadcast_lut(active_streams_1);

+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
+          masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_masking, gpu_memory_allocated);
      auto active_streams_2 =
          streams.active_gpu_subset(num_blocks, params.pbs_type);
-      masking_luts_2[i]->generate_and_broadcast_lut(
-          active_streams_2, {0}, {lut_f_masking}, gpu_memory_allocated);
+      masking_luts_2[i]->broadcast_lut(active_streams_2);
    }

    // create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1014,12 +1042,15 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

    int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
                                     message_extract_lut_2};
-
    auto active_streams =
        streams.active_gpu_subset(num_blocks, params.pbs_type);
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
+      luts[j]->broadcast_lut(active_streams);
    }

    // Give name to closures to improve readability
@@ -1045,14 +1076,24 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      }
    };

-    zero_out_if_overflow_did_not_happen[0]
-        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
-                                               gpu_memory_allocated, {},
-                                               params.message_modulus - 2);
-    zero_out_if_overflow_did_not_happen[1]
-        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
-                                               gpu_memory_allocated, {},
-                                               params.message_modulus - 1);
+    generate_device_accumulator_bivariate_with_factor<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        zero_out_if_overflow_did_not_happen[0]->get_lut(0, 0),
+        zero_out_if_overflow_did_not_happen[0]->get_degree(0),
+        zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, cur_lut_f, params.message_modulus - 2,
+        gpu_memory_allocated);
+    zero_out_if_overflow_did_not_happen[0]->broadcast_lut(active_streams);
+    generate_device_accumulator_bivariate_with_factor<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
+        zero_out_if_overflow_did_not_happen[1]->get_degree(0),
+        zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, cur_lut_f, params.message_modulus - 1,
+        gpu_memory_allocated);
+    zero_out_if_overflow_did_not_happen[1]->broadcast_lut(active_streams);

    // create and generate zero_out_if_overflow_happened
    zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -1069,12 +1110,24 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      }
    };

-    zero_out_if_overflow_happened[0]->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {overflow_happened_f}, gpu_memory_allocated, {},
-        params.message_modulus - 2);
-    zero_out_if_overflow_happened[1]->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {overflow_happened_f}, gpu_memory_allocated, {},
-        params.message_modulus - 1);
+    generate_device_accumulator_bivariate_with_factor<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        zero_out_if_overflow_happened[0]->get_lut(0, 0),
+        zero_out_if_overflow_happened[0]->get_degree(0),
+        zero_out_if_overflow_happened[0]->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
+        gpu_memory_allocated);
+    zero_out_if_overflow_happened[0]->broadcast_lut(active_streams);
+    generate_device_accumulator_bivariate_with_factor<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        zero_out_if_overflow_happened[1]->get_lut(0, 0),
+        zero_out_if_overflow_happened[1]->get_degree(0),
+        zero_out_if_overflow_happened[1]->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
+        gpu_memory_allocated);
+    zero_out_if_overflow_happened[1]->broadcast_lut(active_streams);

    // merge_overflow_flags_luts
    merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
@@ -1088,8 +1141,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-      merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
-          active_gpu_count_for_bits, {0}, {lut_f_bit}, gpu_memory_allocated);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          merge_overflow_flags_luts[i]->get_lut(0, 0),
+          merge_overflow_flags_luts[i]->get_degree(0),
+          merge_overflow_flags_luts[i]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, lut_f_bit, gpu_memory_allocated);
+      merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
    }
  }

@@ -1498,12 +1557,16 @@ template <typename Torus> struct int_div_rem_memory {
      compare_signed_bits_lut = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          compare_signed_bits_lut->get_lut(0, 0),
+          compare_signed_bits_lut->get_degree(0),
+          compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          f_compare_extracted_signed_bits, gpu_memory_allocated);
      auto active_gpu_count_cmp =
          streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
-
-      compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
-          active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
-          gpu_memory_allocated);
+      compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
@@ -53,8 +53,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return count;
    };

-    univ_lut_mem->generate_and_broadcast_lut(
-        active_streams, {0}, {generate_uni_lut_lambda}, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
+        univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
+
+    univ_lut_mem->broadcast_lut(active_streams);

    auto generate_bi_lut_lambda =
        [num_bits](Torus block_num_bit_count,
@@ -65,8 +70,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return 0;
    };

-    biv_lut_mem->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {generate_bi_lut_lambda}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
+        biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
+
+    biv_lut_mem->broadcast_lut(active_streams);

    this->tmp_ct = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -222,7 +232,7 @@ template <typename Torus> struct int_ilog2_buffer {
        this->sum_output_not_propagated, counter_num_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

-    lut_message_not =
+    this->lut_message_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
                                 allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus)> lut_message_lambda =
@@ -230,11 +240,16 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t message = x % this->params.message_modulus;
      return (~message) % this->params.message_modulus;
    };
-
+    generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
+                                this->lut_message_not->get_lut(0, 0),
+                                this->lut_message_not->get_degree(0),
+                                this->lut_message_not->get_max_degree(0),
+                                params.glwe_dimension, params.polynomial_size,
+                                params.message_modulus, params.carry_modulus,
+                                lut_message_lambda, allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
-    lut_message_not->generate_and_broadcast_lut(
-        active_streams, {0}, {lut_message_lambda}, allocate_gpu_memory);
+    lut_message_not->broadcast_lut(active_streams);

    this->lut_carry_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -244,8 +259,13 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t carry = x / this->params.message_modulus;
      return (~carry) % this->params.message_modulus;
    };
-    lut_carry_not->generate_and_broadcast_lut(
-        active_streams, {0}, {lut_carry_lambda}, allocate_gpu_memory);
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0),
+        this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
+        this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        lut_carry_lambda, allocate_gpu_memory);
+    lut_carry_not->broadcast_lut(active_streams);

    this->message_blocks_not = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
@@ -37,12 +37,17 @@ template <typename Torus> struct int_mul_memory {
      zero_out_predicate_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          zero_out_predicate_lut->get_lut(0, 0),
+          zero_out_predicate_lut->get_degree(0),
+          zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          zero_out_predicate_lut_f, gpu_memory_allocated);

      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {zero_out_predicate_lut_f},
-          gpu_memory_allocated);
+      zero_out_predicate_lut->broadcast_lut(active_streams);

      zero_out_mem = new int_zero_out_if_buffer<Torus>(
          streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -50,7 +55,10 @@ template <typename Torus> struct int_mul_memory {
      return;
    }

+    auto glwe_dimension = params.glwe_dimension;
+    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
+    auto carry_modulus = params.carry_modulus;

    // 'vector_result_lsb' contains blocks from all possible shifts of
    // radix_lwe_left excluding zero ciphertext blocks
@@ -83,6 +91,8 @@ template <typename Torus> struct int_mul_memory {
    // luts_array -> lut = {lsb_acc, msb_acc}
    luts_array = new int_radix_lut<Torus>(streams, params, 2, total_block_count,
                                          allocate_gpu_memory, size_tracker);
+    auto lsb_acc = luts_array->get_lut(0, 0);
+    auto msb_acc = luts_array->get_lut(0, 1);

    // define functions for each accumulator
    auto lut_f_lsb = [message_modulus](Torus x, Torus y) -> Torus {
@@ -92,6 +102,18 @@ template <typename Torus> struct int_mul_memory {
      return (x * y) / message_modulus;
    };

+    // generate accumulators
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), lsb_acc,
+        luts_array->get_degree(0), luts_array->get_max_degree(0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        lut_f_lsb, gpu_memory_allocated);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), msb_acc,
+        luts_array->get_degree(1), luts_array->get_max_degree(1),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        lut_f_msb, gpu_memory_allocated);
+
    // lut_indexes_vec for luts_array should be reinitialized
    // first lsb_vector_block_count value should reference to lsb_acc
    // last msb_vector_block_count values should reference to msb_acc
@@ -101,12 +123,9 @@ template <typename Torus> struct int_mul_memory {
          streams.stream(0), streams.gpu_index(0),
          luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
          msb_vector_block_count);
-
    auto active_streams =
        streams.active_gpu_subset(total_block_count, params.pbs_type);
-    luts_array->generate_and_broadcast_bivariate_lut(
-        active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, gpu_memory_allocated);
-
+    luts_array->broadcast_lut(active_streams);
    // create memory object for sum ciphertexts
    sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
        streams, params, num_radix_blocks, 2 * num_radix_blocks,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
@@ -53,10 +53,6 @@ template <typename Torus> struct int_grouped_oprf_memory {

    // Pre-generate all possible LUTs.
    //
-    std::vector<std::function<Torus(Torus)>> lut_funcs;
-    std::vector<uint32_t> lut_indices;
-    std::vector<uint64_t> lut_degrees;
-
    for (uint32_t random_bit = 1; random_bit <= message_bits_per_block;
         ++random_bit) {
      uint64_t p = 1ULL << random_bit;
@@ -74,13 +70,14 @@ template <typename Torus> struct int_grouped_oprf_memory {

      uint64_t degree = 0;
      uint32_t lut_index = random_bit - 1;
-
-      lut_funcs.push_back(lut_f);
-      lut_indices.push_back(lut_index);
-
+      generate_device_accumulator_no_encoding<Torus>(
+          streams.stream(0), streams.gpu_index(0), luts->get_lut(0, lut_index),
+          degree, params.message_modulus, params.carry_modulus,
+          params.glwe_dimension, params.polynomial_size, lut_f,
+          allocate_gpu_memory);
      // In  OPRF the degree is hard set to p - 1 instead of the LUT degree
      degree = p - 1;
-      lut_degrees.push_back(degree);
+      *luts->get_degree(lut_index) = degree;
    }

    // For each block, this loop determines the exact number of bits to generate
@@ -131,16 +128,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
        streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
-
-    // No encoding for these LUTS. Generate LUT also sets LUT degrees to default
-    // values
-    luts->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
-                                     allocate_gpu_memory, false);
-
-    // OPRF requires custom LUT degrees
-    for (uint32_t i = 0; i < lut_degrees.size(); ++i) {
-      *luts->get_degree(i) = lut_degrees[i];
-    }
+    luts->broadcast_lut(active_streams);

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_corrections);
--- a/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
@@ -85,11 +85,15 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
+          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          shift_lut_f, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
+      cur_lut_bivariate->broadcast_lut(active_streams);

      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
@@ -168,10 +172,16 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
+          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          shift_lut_f, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
+      cur_lut_bivariate->broadcast_lut(active_streams);
+
      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
  }
@@ -261,11 +271,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return shifted | padding;
      };

+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          shift_last_block_lut_univariate->get_lut(0, 0),
+          shift_last_block_lut_univariate->get_degree(0),
+          shift_last_block_lut_univariate->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
      auto active_streams_shift_last =
          streams.active_gpu_subset(1, params.pbs_type);
-      shift_last_block_lut_univariate->generate_and_broadcast_lut(
-          active_streams_shift_last, {0}, {last_block_lut_f},
-          gpu_memory_allocated);
+      shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);

      lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
    }
@@ -283,8 +298,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
      return (params.message_modulus - 1) * x_sign_bit;
    };

-    padding_block_lut_univariate->generate_and_broadcast_lut(
-        active_streams, {0}, {padding_block_lut_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        padding_block_lut_univariate->get_lut(0, 0),
+        padding_block_lut_univariate->get_degree(0),
+        padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        padding_block_lut_f, gpu_memory_allocated);
+    // auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
+    padding_block_lut_univariate->broadcast_lut(active_streams);

    lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -317,11 +339,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return message_of_current_block + carry_of_previous_block;
      };

+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          shift_blocks_lut_bivariate->get_lut(0, 0),
+          shift_blocks_lut_bivariate->get_degree(0),
+          shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          blocks_lut_f, gpu_memory_allocated);
      auto active_streams_shift_blocks =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams_shift_blocks, {0}, {blocks_lut_f},
-          gpu_memory_allocated);
+      shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);

      lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
    }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
@@ -113,20 +113,27 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
      else
        return current_bit;
    };
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
+        mux_lut->get_degree(0), mux_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, mux_lut_f, gpu_memory_allocated);
    auto active_gpu_count_mux = streams.active_gpu_subset(
        bits_per_block * num_radix_blocks, params.pbs_type);
-
-    mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
-                                        gpu_memory_allocated);
+    mux_lut->broadcast_lut(active_gpu_count_mux);

    auto cleaning_lut_f = [params](Torus x) -> Torus {
      return x % params.message_modulus;
    };
-
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
+        cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
    auto active_gpu_count_cleaning =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    cleaning_lut->generate_and_broadcast_lut(
-        active_gpu_count_cleaning, {0}, {cleaning_lut_f}, gpu_memory_allocated);
+    cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
@@ -74,26 +74,45 @@ template <typename Torus> struct int_overflowing_sub_memory {
                                           luts_array, size_tracker,
                                           allocate_gpu_memory, size_tracker);

+    auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
+    auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
+
+    // generate luts (aka accumulators)
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
+        luts_array->get_degree(0), luts_array->get_max_degree(0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        f_lut_does_block_generate_carry, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        lut_does_block_generate_or_propagate, luts_array->get_degree(1),
+        luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
+        gpu_memory_allocated);
    if (allocate_gpu_memory)
      cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                  luts_array->get_lut_indexes(0, 1), 1,
                                  num_radix_blocks - 1);

+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        luts_borrow_propagation_sum->get_lut(0, 0),
+        luts_borrow_propagation_sum->get_degree(0),
+        luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        f_luts_borrow_propagation_sum, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
+        message_acc->get_degree(0), message_acc->get_max_degree(0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        f_message_acc, gpu_memory_allocated);
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {f_luts_borrow_propagation_sum},
-        gpu_memory_allocated);
-
-    luts_array->generate_and_broadcast_lut(
-        active_streams, {0, 1},
-        {f_lut_does_block_generate_carry,
-         f_lut_does_block_generate_or_propagate},
-        gpu_memory_allocated);
-    // generate luts (aka accumulators)
-
-    message_acc->generate_and_broadcast_lut(
-        active_streams, {0}, {f_message_acc}, gpu_memory_allocated);
+    luts_array->broadcast_lut(active_streams);
+    luts_borrow_propagation_sum->broadcast_lut(active_streams);
+    message_acc->broadcast_lut(active_streams);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
@@ -7,8 +7,7 @@
 #include <functional>
 #include <vector>

-// If we use more than 5 streams the result is incorrect
-const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 5;
+const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;

 template <typename Torus> struct int_equality_selectors_buffer {
  int_radix_params params;
@@ -61,10 +60,18 @@ template <typename Torus> struct int_equality_selectors_buffer {
      fns.push_back([i](Torus x) -> Torus { return (x == i); });
    }

-    this->comparison_luts->generate_and_broadcast_many_lut(
-        active_streams, {0}, {fns}, allocate_gpu_memory);
+    generate_many_lut_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->comparison_luts->get_lut(0, 0),
+        this->comparison_luts->get_degree(0),
+        this->comparison_luts->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        fns, allocate_gpu_memory);
+
    fns.clear();

+    this->comparison_luts->broadcast_lut(active_streams);
+
    this->tmp_many_luts_output = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), this->tmp_many_luts_output,
@@ -195,10 +202,15 @@ template <typename Torus> struct int_possible_results_buffer {
          fns.push_back([c](Torus x) -> Torus { return (x == 1) * c; });
        }

-        current_lut->generate_and_broadcast_many_lut(
-            streams.active_gpu_subset(1, params.pbs_type), {0}, {fns},
+        generate_many_lut_device_accumulator<Torus>(
+            streams.stream(0), streams.gpu_index(0), current_lut->get_lut(0, 0),
+            current_lut->get_degree(0), current_lut->get_max_degree(0),
+            params.glwe_dimension, params.polynomial_size,
+            params.message_modulus, params.carry_modulus, fns,
            allocate_gpu_memory);

+        current_lut->broadcast_lut(
+            streams.active_gpu_subset(1, params.pbs_type));
        stream_luts[lut_count++] = current_lut;
        lut_value_start += luts_in_this_call;
      }
@@ -286,10 +298,14 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
      int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

-      lut->generate_and_broadcast_lut(
-          streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
-          allocate_gpu_memory);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          id_fn, allocate_gpu_memory);

+      lut->broadcast_lut(
+          streams.active_gpu_subset(num_blocks, params.pbs_type));
      this->stream_identity_luts[i] = lut;
    }

@@ -302,17 +318,27 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {

    this->message_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-
-    this->message_extract_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
-        allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->message_extract_lut->get_lut(0, 0),
+        this->message_extract_lut->get_degree(0),
+        this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        msg_fn, allocate_gpu_memory);
+    this->message_extract_lut->broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type));

    this->carry_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-
-    this->carry_extract_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
-        allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->carry_extract_lut->get_lut(0, 0),
+        this->carry_extract_lut->get_degree(0),
+        this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        carry_fn, allocate_gpu_memory);
+    this->carry_extract_lut->broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type));

    this->partial_aggregated_vectors =
        new CudaRadixCiphertextFFI *[num_streams];
@@ -1159,9 +1185,15 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {prefix_sum_fn}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->prefix_sum_lut->get_lut(0, 0),
+        this->prefix_sum_lut->get_degree(0),
+        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        prefix_sum_fn, allocate_gpu_memory);
+    this->prefix_sum_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1171,9 +1203,14 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    this->cleanup_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {cleanup_fn}, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
+        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        cleanup_fn, allocate_gpu_memory);
+    this->cleanup_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
  }

  void release(CudaStreams streams) {
@@ -1339,9 +1376,15 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {prefix_sum_fn}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->prefix_sum_lut->get_lut(0, 0),
+        this->prefix_sum_lut->get_degree(0),
+        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        prefix_sum_fn, allocate_gpu_memory);
+    this->prefix_sum_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1351,9 +1394,14 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    this->cleanup_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {cleanup_fn}, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
+        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        cleanup_fn, allocate_gpu_memory);
+    this->cleanup_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
@@ -1,24 +0,0 @@
-#ifndef KREYVIUM_H
-#define KREYVIUM_H
-
-#include "../integer/integer.h"
-
-extern "C" {
-uint64_t scratch_cuda_kreyvium_64(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
-
-void cuda_kreyvium_generate_keystream_64(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
-    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
-    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
-
-void cleanup_cuda_kreyvium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium_utilities.h
@@ -1,320 +0,0 @@
-#ifndef KREYVIUM_UTILITIES_H
-#define KREYVIUM_UTILITIES_H
-#include "../integer/integer_utilities.h"
-
-// Kreyvium specific constants
-// The batch size is set to 64 to allow efficient parallel processing of 64
-// steps at once.
-constexpr uint32_t KREYVIUM_BATCH_SIZE = 64;
-
-// In each Kreyvium step, there are exactly 3 non-linear AND operations:
-// 1. (c109 & c108)
-// 2. (a91 & a90)
-// 3. (b82 & b81)
-constexpr uint32_t KREYVIUM_NUM_AND_GATES = 3;
-
-// In each Kreyvium step, there are 4 paths that require a "flush"
-// to noise-cancel and extract the bit:
-// 1. New bit for Register A
-// 2. New bit for Register B
-// 3. New bit for Register C
-// 4. The Output Keystream bit
-constexpr uint32_t KREYVIUM_NUM_FLUSH_PATHS = 4;
-
-/// Struct to hold the LUTs.
-template <typename Torus> struct int_kreyvium_lut_buffers {
-  // Bivariate AND Gate LUT:
-  // AND operation: f(a, b) = (a & 1) & (b & 1).
-  // This is a Bivariate PBS used for the non-linear parts of Kreyvium.
-  int_radix_lut<Torus> *and_lut;
-
-  // Univariate Flush/Identity LUT:
-  // MESSAGE EXTRACTION operation: f(x) = x & 1.
-  // This is a Univariate PBS used to "flush" the state (reset noise/carries).
-  int_radix_lut<Torus> *flush_lut;
-
-  int_kreyvium_lut_buffers(CudaStreams streams, const int_radix_params &params,
-                           bool allocate_gpu_memory, uint32_t num_inputs,
-                           uint64_t &size_tracker) {
-
-    uint32_t and_ops =
-        num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_AND_GATES;
-    uint32_t flush_ops =
-        num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_FLUSH_PATHS;
-
-    this->and_lut = new int_radix_lut<Torus>(streams, params, 1, and_ops,
-                                             allocate_gpu_memory, size_tracker);
-
-    std::function<Torus(Torus, Torus)> and_lambda =
-        [](Torus lhs, Torus rhs) -> Torus { return (lhs & 1) & (rhs & 1); };
-
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
-        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, and_lambda, allocate_gpu_memory);
-
-    auto active_streams_and =
-        streams.active_gpu_subset(and_ops, params.pbs_type);
-    this->and_lut->broadcast_lut(active_streams_and);
-    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
-
-    this->flush_lut = new int_radix_lut<Torus>(
-        streams, params, 1, flush_ops, allocate_gpu_memory, size_tracker);
-
-    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
-      return x & 1;
-    };
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
-        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, flush_lambda, allocate_gpu_memory);
-
-    auto active_streams_flush =
-        streams.active_gpu_subset(flush_ops, params.pbs_type);
-    this->flush_lut->broadcast_lut(active_streams_flush);
-    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
-  }
-
-  void release(CudaStreams streams) {
-    this->and_lut->release(streams);
-    delete this->and_lut;
-    this->and_lut = nullptr;
-
-    this->flush_lut->release(streams);
-    delete this->flush_lut;
-    this->flush_lut = nullptr;
-
-    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
-  }
-};
-
-/// Struct to hold the Kreyvium internal state and temporary workspaces.
-template <typename Torus> struct int_kreyvium_state_workspaces {
-
-  CudaRadixCiphertextFFI *a_reg;
-  CudaRadixCiphertextFFI *b_reg;
-  CudaRadixCiphertextFFI *c_reg;
-  CudaRadixCiphertextFFI *k_reg;
-  CudaRadixCiphertextFFI *iv_reg;
-
-  // Shift Workspace
-  CudaRadixCiphertextFFI *shift_workspace;
-
-  // Temporary Update Buffers
-  CudaRadixCiphertextFFI *temp_a;
-  CudaRadixCiphertextFFI *temp_b;
-  CudaRadixCiphertextFFI *temp_c;
-
-  CudaRadixCiphertextFFI *packed_and_lhs;
-  CudaRadixCiphertextFFI *packed_and_rhs;
-  CudaRadixCiphertextFFI *packed_and_out;
-
-  // Flush/Cleanup Packing Buffers
-  CudaRadixCiphertextFFI *packed_flush_in;
-  CudaRadixCiphertextFFI *packed_flush_out;
-
-  uint32_t max_batch_blocks;
-  uint32_t k_offset;
-  uint32_t iv_offset;
-
-  int_kreyvium_state_workspaces(CudaStreams streams,
-                                const int_radix_params &params,
-                                bool allocate_gpu_memory, uint32_t num_inputs,
-                                uint64_t &size_tracker) {
-
-    uint32_t batch_blocks = KREYVIUM_BATCH_SIZE * num_inputs;
-    this->max_batch_blocks = batch_blocks;
-    this->k_offset = 0;
-    this->iv_offset = 0;
-
-    this->a_reg = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->a_reg, 93 * num_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->b_reg = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->b_reg, 84 * num_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->c_reg = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->c_reg, 111 * num_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->k_reg = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->k_reg, 128 * num_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->iv_reg = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->iv_reg, 128 * num_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->shift_workspace = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->shift_workspace,
-        128 * num_inputs, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->temp_a = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->temp_a, batch_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->temp_b = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->temp_b, batch_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->temp_c = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->temp_c, batch_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->packed_and_lhs = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_and_lhs,
-        KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
-
-    this->packed_and_rhs = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_and_rhs,
-        KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
-
-    this->packed_and_out = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_and_out,
-        KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
-
-    this->packed_flush_in = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_flush_in,
-        KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
-
-    this->packed_flush_out = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_flush_out,
-        KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
-        size_tracker, allocate_gpu_memory);
-  }
-
-  void release(CudaStreams streams, bool allocate_gpu_memory) {
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->a_reg, allocate_gpu_memory);
-    delete this->a_reg;
-    this->a_reg = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->b_reg, allocate_gpu_memory);
-    delete this->b_reg;
-    this->b_reg = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->c_reg, allocate_gpu_memory);
-    delete this->c_reg;
-    this->c_reg = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->k_reg, allocate_gpu_memory);
-    delete this->k_reg;
-    this->k_reg = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->iv_reg, allocate_gpu_memory);
-    delete this->iv_reg;
-    this->iv_reg = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->shift_workspace, allocate_gpu_memory);
-    delete this->shift_workspace;
-    this->shift_workspace = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->temp_a, allocate_gpu_memory);
-    delete this->temp_a;
-    this->temp_a = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->temp_b, allocate_gpu_memory);
-    delete this->temp_b;
-    this->temp_b = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->temp_c, allocate_gpu_memory);
-    delete this->temp_c;
-    this->temp_c = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_and_lhs, allocate_gpu_memory);
-    delete this->packed_and_lhs;
-    this->packed_and_lhs = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_and_rhs, allocate_gpu_memory);
-    delete this->packed_and_rhs;
-    this->packed_and_rhs = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_and_out, allocate_gpu_memory);
-    delete this->packed_and_out;
-    this->packed_and_out = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_flush_in, allocate_gpu_memory);
-    delete this->packed_flush_in;
-    this->packed_flush_in = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_flush_out, allocate_gpu_memory);
-    delete this->packed_flush_out;
-    this->packed_flush_out = nullptr;
-
-    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
-  }
-};
-
-template <typename Torus> struct int_kreyvium_buffer {
-  int_radix_params params;
-  bool allocate_gpu_memory;
-  uint32_t num_inputs;
-
-  int_kreyvium_lut_buffers<Torus> *luts;
-  int_kreyvium_state_workspaces<Torus> *state;
-
-  int_kreyvium_buffer(CudaStreams streams, const int_radix_params &params,
-                      bool allocate_gpu_memory, uint32_t num_inputs,
-                      uint64_t &size_tracker) {
-    this->params = params;
-    this->allocate_gpu_memory = allocate_gpu_memory;
-    this->num_inputs = num_inputs;
-
-    this->luts = new int_kreyvium_lut_buffers<Torus>(
-        streams, params, allocate_gpu_memory, num_inputs, size_tracker);
-
-    this->state = new int_kreyvium_state_workspaces<Torus>(
-        streams, params, allocate_gpu_memory, num_inputs, size_tracker);
-  }
-
-  void release(CudaStreams streams) {
-    luts->release(streams);
-    delete luts;
-    luts = nullptr;
-
-    state->release(streams, allocate_gpu_memory);
-    delete state;
-    state = nullptr;
-
-    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
-  }
-};
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
@@ -30,10 +30,15 @@ template <typename Torus> struct int_trivium_lut_buffers {
    std::function<Torus(Torus, Torus)> and_lambda =
        [](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };

+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
+        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, and_lambda, allocate_gpu_memory);
+
    auto active_streams_and =
        streams.active_gpu_subset(total_lut_ops, params.pbs_type);
-    this->and_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_and, {0}, {and_lambda}, allocate_gpu_memory);
+    this->and_lut->broadcast_lut(active_streams_and);
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
@@ -45,10 +50,15 @@ template <typename Torus> struct int_trivium_lut_buffers {
      return x & 1;
    };

+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
+        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, flush_lambda, allocate_gpu_memory);
+
    auto active_streams_flush =
        streams.active_gpu_subset(total_flush_ops, params.pbs_type);
-    this->flush_lut->generate_and_broadcast_lut(
-        active_streams_flush, {0}, {flush_lambda}, allocate_gpu_memory);
+    this->flush_lut->broadcast_lut(active_streams_flush);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -14,10 +14,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
    uint32_t casting_output_dimension, uint32_t casting_ks_level,
    uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
-    const bool *is_boolean_array, const uint32_t is_boolean_array_len,
-    uint32_t num_compact_lists, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    const bool *is_boolean_array, uint32_t num_compact_lists,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    KS_TYPE casting_key_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_expand_without_verification_64(
    CudaStreamsFFI streams, void *lwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -118,8 +118,7 @@ template <typename Torus> struct zk_expand_mem {
  zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
-                const bool *is_boolean_array,
-                const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
+                const bool *is_boolean_array, uint32_t num_compact_lists,
                bool allocate_gpu_memory, uint64_t &size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
@@ -175,6 +174,40 @@ template <typename Torus> struct zk_expand_mem {
    message_and_carry_extract_luts = new int_radix_lut<Torus>(
        streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);

+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_and_carry_extract_luts->get_lut(0, 0),
+        message_and_carry_extract_luts->get_degree(0),
+        message_and_carry_extract_luts->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_and_carry_extract_luts->get_lut(0, 1),
+        message_and_carry_extract_luts->get_degree(1),
+        message_and_carry_extract_luts->get_max_degree(1),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_and_carry_extract_luts->get_lut(0, 2),
+        message_and_carry_extract_luts->get_degree(2),
+        message_and_carry_extract_luts->get_max_degree(2),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, message_extract_and_sanitize_bool_lut_f,
+        gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_and_carry_extract_luts->get_lut(0, 3),
+        message_and_carry_extract_luts->get_degree(3),
+        message_and_carry_extract_luts->get_max_degree(3),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
+        gpu_memory_allocated);
+
    // We are always packing two LWEs. We just need to be sure we have enough
    // space in the carry part to store a message of the same size as is in the
    // message part.
@@ -237,36 +270,14 @@ template <typename Torus> struct zk_expand_mem {
      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
        auto lwe_index = i + num_packed_msgs * offset;
        auto lwe_index_in_list = i % num_lwes_in_kth;
-        PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
-                       "Cuda error: index %d is beyond the max value %d",
-                       lwe_index, num_packed_msgs * num_lwes);
        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
        h_indexes_out[lwe_index] =
            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
-        PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
-                       "Cuda error: index %d is beyond the max value %d",
-                       h_indexes_in[lwe_index], num_packed_msgs * num_lwes);
-        PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
-                       "Cuda error: index %d is beyond the max value %d",
-                       h_indexes_out[lwe_index], num_packed_msgs * num_lwes);
-        // is_boolean_array tells us which input is a boolean and thus the
-        // related output needs boolean sanitization. It naturally has
-        // total_blocks entries, but h_indexes_out reaches
-        // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
-        // the ceiling causes out-of-bounds access. Reading garbage "true" would
-        // set h_lut_indexes to an invalid index pointing to uninitialized
-        // memory instead of a real LUT. Rust pads is_boolean_array with FALSE
-        // to match.
-        PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
-                       "Cuda error: index %d for is_boolean_array is out of "
-                       "bounds (len is %d)",
-                       h_indexes_out[lwe_index], is_boolean_array_len);
+        // If the input relates to a boolean, shift the LUT so the correct one
+        // with sanitization is used
        auto boolean_offset =
            is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
        h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
-        PANIC_IF_FALSE(
-            h_lut_indexes[lwe_index] < 4,
-            "Cuda error: lut index is greater than the max possible value (3)");
      }
      offset += num_lwes_in_kth;
    }
@@ -281,13 +292,7 @@ template <typename Torus> struct zk_expand_mem {

    auto active_streams =
        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
-
-    message_and_carry_extract_luts->generate_and_broadcast_lut(
-        active_streams, {0, 1, 2, 3},
-        {message_extract_lut_f, carry_extract_lut_f,
-         message_extract_and_sanitize_bool_lut_f,
-         carry_extract_and_sanitize_bool_lut_f},
-        gpu_memory_allocated);
+    message_and_carry_extract_luts->broadcast_lut(active_streams);

    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -6,14 +6,6 @@
 #include <cuda_profiler_api.h>
 #endif

-void validate_device_ptr(const void *ptr, uint32_t gpu_index) {
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
-  if (attr.device != gpu_index || attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid device pointer.")
-  }
-}
-
 uint32_t cuda_get_device() {
  int device;
  check_cuda_error(cudaGetDevice(&device));
@@ -257,7 +249,11 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
                                                 bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;
-  validate_device_ptr(dest, gpu_index);
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
+  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
+  }

  cuda_set_device(gpu_index);
  check_cuda_error(
@@ -353,7 +349,11 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                          bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;
-  validate_device_ptr(dest, gpu_index);
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
+  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
+  }
  cuda_set_device(gpu_index);
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }
@@ -408,7 +408,11 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
  if (size == 0)
    return;
-  validate_device_ptr(src, gpu_index);
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, src));
+  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
+  }

  cuda_set_device(gpu_index);
  check_cuda_error(
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
@@ -68,15 +68,9 @@ struct alignas(16) f128 {
    auto t = two_sum(a.lo, b.lo);

    double hi = s.hi;
-#ifdef __CUDA_ARCH__
-    double lo = __dadd_rn(s.lo, t.hi);
-    hi = __dadd_rn(hi, lo);
-    lo = __dsub_rn(lo, __dsub_rn(hi, s.hi));
-#else
    double lo = s.lo + t.hi;
    hi = hi + lo;
    lo = lo - (hi - s.hi);
-#endif

    return f128(hi, lo + t.lo);
  }
@@ -110,13 +104,8 @@ struct alignas(16) f128 {
  __host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
    auto s = two_diff(a.hi, b.hi);
    auto t = two_diff(a.lo, b.lo);
-#ifdef __CUDA_ARCH__
-    s = quick_two_sum(s.hi, __dadd_rn(s.lo, t.hi));
-    return quick_two_sum(s.hi, __dadd_rn(s.lo, t.lo));
-#else
    s = quick_two_sum(s.hi, s.lo + t.hi);
    return quick_two_sum(s.hi, s.lo + t.lo);
-#endif
  }

  // Multiplication
@@ -231,16 +220,16 @@ struct f128x2 {
  // Subtraction
  __host__ __device__ friend f128x2 operator-(const f128x2 &a,
                                              const f128x2 &b) {
-    return f128x2(f128::sub_estimate(a.re, b.re),
-                  f128::sub_estimate(a.im, b.im));
+    return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
+                  f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
  }

  // Multiplication (complex multiplication)
  __host__ __device__ friend f128x2 operator*(const f128x2 &a,
                                              const f128x2 &b) {
-    const f128 a_im_b_im = f128::mul(a.im, b.im);
    f128 real_part =
-        f128::add(f128::mul(a.re, b.re), f128(-a_im_b_im.hi, -a_im_b_im.lo));
+        f128::add(f128::mul(a.re, b.re),
+                  f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
    f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
    return f128x2(real_part, imag_part);
  }
@@ -254,8 +243,8 @@ struct f128x2 {

  // Subtraction-assignment operator
  __host__ __device__ f128x2 &operator-=(const f128x2 &other) {
-    re = f128::sub_estimate(re, other.re);
-    im = f128::sub_estimate(im, other.im);
+    re = f128::add(re, f128(-other.re.hi, -other.re.lo));
+    im = f128::add(im, f128(-other.im.hi, -other.im.lo));
    return *this;
  }

@@ -272,20 +261,12 @@ struct f128x2 {
 };

 __host__ __device__ inline uint64_t double_to_bits(double d) {
-#ifdef __CUDA_ARCH__
-  uint64_t bits = __double_as_longlong(d);
-#else
  uint64_t bits = *reinterpret_cast<uint64_t *>(&d);
-#endif
  return bits;
 }

 __host__ __device__ inline double bits_to_double(uint64_t bits) {
-#ifdef __CUDA_ARCH__
-  double d = __longlong_as_double(bits);
-#else
  double d = *reinterpret_cast<double *>(&bits);
-#endif
  return d;
 }

@@ -294,8 +275,6 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
  const double A = ONE << 52;
  const double B = ONE << 104;
  const double C = ONE << 76;
-  // NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
-  // does not work here
  const double D = 340282366920938500000000000000000000000.;

  const __uint128_t threshold = (ONE << 104);
@@ -309,20 +288,15 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = A_bits | lower64;
    double l_temp = bits_to_double(bits_l);
+    double l = l_temp - A;

    uint64_t B_bits = double_to_bits(B);
    uint64_t top64 = static_cast<uint64_t>(x >> 52);
    uint64_t bits_h = B_bits | top64;
    double h_temp = bits_to_double(bits_h);
-
-#ifdef __CUDA_ARCH__
-    return __dadd_rn(__dsub_rn(l_temp, A), __dsub_rn(h_temp, B));
-#else
-    double l = l_temp - A;
    double h = h_temp - B;

    return (l + h);
-#endif

  } else {
    uint64_t C_bits = double_to_bits(C);
@@ -336,20 +310,15 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = C_bits | lower64 | mask_part;
    double l_temp = bits_to_double(bits_l);
+    double l = l_temp - C;

    uint64_t D_bits = double_to_bits(D);
    uint64_t top64 = static_cast<uint64_t>(x >> 76);
    uint64_t bits_h = D_bits | top64;
    double h_temp = bits_to_double(bits_h);
-
-#ifdef __CUDA_ARCH__
-    return __dadd_rn(__dsub_rn(l_temp, C), __dsub_rn(h_temp, D));
-#else
-    double l = l_temp - C;
    double h = h_temp - D;

    return (l + h);
-#endif
  }
 }

@@ -420,8 +389,6 @@ __host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {

 __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  auto x = f128::sub_estimate(a, f128::f128_floor(a));
-  // NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
-  // does not work here
  const double normalization = 340282366920938500000000000000000000000.;
 #ifdef __CUDA_ARCH__
  x.hi = __dmul_rn(x.hi, normalization);
@@ -431,7 +398,7 @@ __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  x.lo *= normalization;
 #endif

-  x = f128::add_estimate(x, f128(0.5, 0.0));
+  // TODO has to be round
  x = f128::f128_floor(x);

  __uint128_t x0 = f64_to_u128(x.hi);
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -12,9 +12,8 @@
 using Index = unsigned;

 #define NEG_TWID(i)                                                            \
-  f128x2(                                                                      \
-      f128(__ldg(&neg_twiddles_re_hi[(i)]), __ldg(&neg_twiddles_re_lo[(i)])),  \
-      f128(__ldg(&neg_twiddles_im_hi[(i)]), __ldg(&neg_twiddles_im_lo[(i)])))
+  f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]),               \
+         f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))

 #define F64x4_TO_F128x2(f128x2_reg, ind)                                       \
  f128x2_reg.re.hi = dt_re_hi[ind];                                            \
@@ -219,7 +218,7 @@ __device__ void convert_u128_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re, const __uint128_t *in_im) {

-  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
+  const double normalization = pow(2., -128.);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
@@ -242,7 +241,7 @@ __device__ void convert_u128_on_regs_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {

-  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
+  const double normalization = pow(2., -128.);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -134,6 +134,13 @@ __host__ void are_all_comparisons_block_true(
        auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
          return x == chunk_length;
        };
+        generate_device_accumulator_with_cpu_prealloc<Torus>(
+            streams.stream(0), streams.gpu_index(0),
+            is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
+            is_max_value_lut->get_max_degree(1), glwe_dimension,
+            polynomial_size, message_modulus, carry_modulus,
+            is_equal_to_num_blocks_lut_f, true,
+            are_all_block_true_buffer->preallocated_h_lut);

        Torus *h_lut_indexes = is_max_value_lut->h_lut_indexes;
        for (int index = 0; index < num_chunks; index++) {
@@ -148,10 +155,7 @@ __host__ void are_all_comparisons_block_true(
                                 streams.stream(0), streams.gpu_index(0));
        auto active_streams =
            streams.active_gpu_subset(num_chunks, params.pbs_type);
-
-        is_max_value_lut->generate_and_broadcast_lut(
-            active_streams, {1}, {is_equal_to_num_blocks_lut_f}, true, true,
-            {are_all_block_true_buffer->preallocated_h_lut});
+        is_max_value_lut->broadcast_lut(active_streams);
      }
      lut = is_max_value_lut;
    }
@@ -479,10 +483,14 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    y = x;
    f = sign_handler_f;
  }
+  generate_device_accumulator_with_cpu_prealloc<Torus>(
+      streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
+      last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
+      polynomial_size, message_modulus, carry_modulus, f, true,
+      tree_buffer->preallocated_h_lut);

  auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-  last_lut->generate_and_broadcast_lut(active_streams, {0}, {f}, true, true,
-                                       {tree_buffer->preallocated_h_lut});
+  last_lut->broadcast_lut(active_streams);

  // Last leaf
  integer_radix_apply_univariate_lookup_table<Torus>(streams, lwe_array_out, y,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -12,121 +12,6 @@
 #include "polynomial/functions.cuh"
 #include "utils/kernel_dimensions.cuh"

-/*
- * =============================================================================
- * GPU Compression/Decompression Algorithm: Overview
- * =============================================================================
- *
- * The compression algorithm transforms standard LWE ciphertexts into a compact
- * packed format. Decompression reverses this process.
- *
- * -----------------------------------------------------------------------------
- * COMPRESSION INPUT (lwe_array_in)
- * -----------------------------------------------------------------------------
- *
- *  +-------------------------------------------------------------------------+
- *  |                    lwe_array_in (GPU memory)                            |
- *  +-------------------------------------------------------------------------+
- *  +---------------------------+---------------------------+-----------------+
- *  |          LWE 0            |          LWE 1            |      ...        |
- *  |      [mask, body]         |      [mask, body]         |                 |
- *  +---------------------------+---------------------------+-----------------+
- *  |<-- lwe_dimension + 1 -->|
- *
- *  Total LWEs: total_lwe_bodies_count (num_radix_blocks)
- *
- * -----------------------------------------------------------------------------
- * COMPRESSION PROCESS
- * -----------------------------------------------------------------------------
- *
- * 1. Message Shift (64-bit only):
- *    Each LWE is multiplied by message_modulus to shift the message to MSB
- *
- * 2. Packing Keyswitch (LWE -> GLWE):
- *    Groups of up to lwe_per_glwe LWEs are packed into a single GLWE:
- *
- *    +--------------------------------------------------------------+
- *    |   lwe_per_glwe LWEs (input batch)                            |
- *    |   LWE[0], LWE[1], ..., LWE[lwe_per_glwe-1]                   |
- *    +--------------------------------------------------------------+
- *                              |
- *                    Packing Keyswitch
- *                              v
- *    +--------------------------------------------------------------+
- *    |            Single GLWE Ciphertext                            |
- *    |   [A_0, A_1, ..., A_{k-1}, B]                                |
- *    |   |<-- k * polynomial_size -->| |<-- polynomial_size -->|   |
- *    +--------------------------------------------------------------+
- *
- *    Number of output GLWEs: num_glwes = ceil(total_lwe_bodies_count /
- *                                             lwe_per_glwe)
- *
- * 3. Modulus Switch:
- *    Reduce precision from 64-bit torus to storage_log_modulus bits
- *
- * 4. Bit Packing:
- *    Pack multiple reduced-precision elements into dense bit representation
- *
- * -----------------------------------------------------------------------------
- * COMPRESSION MEMORY LAYOUT (tmp_glwe_array_out)
- * -----------------------------------------------------------------------------
- *
- *  +-------------------------------------------------------------------------+
- *  |                 tmp_glwe_array_out (intermediate buffer)                |
- *  +-------------------------------------------------------------------------+
- *  +----------------------------+----------------------------+---------------+
- *  |         GLWE 0             |         GLWE 1             |    ...        |
- *  |  [A_0..A_{k-1}, B_0..B_N]  |  [A_0..A_{k-1}, B_0..B_N]  |               |
- *  +----------------------------+----------------------------+---------------+
- *       |<-- glwe_accumulator_size = (k+1)*N -->|
- *
- *  Total size needed: num_glwes * glwe_accumulator_size elements
- *  Where: num_glwes = ceil(total_lwe_bodies_count / lwe_per_glwe)
- *
- * -----------------------------------------------------------------------------
- * PACKED OUTPUT (glwe_array_out)
- * -----------------------------------------------------------------------------
- *
- *  +-------------------------------------------------------------------------+
- *  |              Packed GLWE Ciphertext List (bit-packed)                   |
- *  +-------------------------------------------------------------------------+
- *  +-------------------------------------------------------------------------+
- *  |  Elements packed with storage_log_modulus bits per original element    |
- *  |  Total packed size: ceil(in_len * storage_log_modulus / 64) elements   |
- *  +-------------------------------------------------------------------------+
- *
- * =============================================================================
- * DECOMPRESSION (Extract) Algorithm
- * =============================================================================
- *
- * Decompression receives an array of LWE indexes. For each index, it identifies
- * the corresponding GLWE, extracts that GLWE from the packed representation,
- * and then sample-extracts the requested LWE from the GLWE.
- *
- * -----------------------------------------------------------------------------
- * EXTRACT OUTPUT LAYOUT (glwe_array_out in host_extract)
- * -----------------------------------------------------------------------------
- *
- *  +-------------------------------------------------------------------------+
- *  |               Extracted GLWE Ciphertext                                 |
- *  +-------------------------------------------------------------------------+
- *  +---------------------------------------+-----------------+---------------+
- *  |    Mask (A polynomials)               |   Body (B)      |    Tail       |
- *  |    [A_0, ..., A_{k-1}]                |   (body_count)  |   (zeroed)    |
- *  |    k * polynomial_size elements       |   elements      |   elements    |
- *  +---------------------------------------+-----------------+---------------+
- *  |<------------------- initial_out_len ------------------->|               |
- *  |<------------------------ glwe_ciphertext_size ------------------------->|
- *
- *  For the last GLWE, body_count may be less than polynomial_size (partial).
- *  The tail region must be zeroed to ensure defined behavior.
- *
- *  tail_size = glwe_ciphertext_size - initial_out_len
- *  tail_offset = initial_out_len  (NOT 0!)
- *
- * =============================================================================
- */
-
 template <typename Torus>
 __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
                     uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
@@ -223,8 +108,6 @@ host_integer_compress(CudaStreams streams,
  uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
                        glwe_array_out->lwe_per_glwe - 1) /
                       glwe_array_out->lwe_per_glwe;
-  PANIC_IF_FALSE(num_glwes <= mem_ptr->max_num_glwes,
-                 "Invalid number of GLWEs");

  // Keyswitch LWEs to GLWE
  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -346,11 +229,8 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
  auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;

  // Ensure the tail of the GLWE is zeroed
-  // The extract kernel writes initial_out_len elements starting at offset 0.
-  // We must zero the tail region (from initial_out_len to
-  // glwe_ciphertext_size)
  if (initial_out_len < glwe_ciphertext_size) {
-    cuda_memset_async(glwe_array_out + initial_out_len, 0,
+    cuda_memset_async(glwe_array_out, 0,
                      (glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
                      stream, gpu_index);
  }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -962,9 +962,8 @@ uint64_t generate_many_lookup_table(
 template <typename Torus>
 void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,
                                       uint32_t polynomial_size,
-                                       std::function<Torus(Torus)> f) {
+                                       std::function<Torus(uint32_t)> f) {

-  // accumulator number of elements is (glwe_dimension + 1) * polynomial_size
  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];
@@ -976,9 +975,9 @@ void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,

 template <typename Torus>
 void generate_device_accumulator_no_encoding(
-    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t &degree,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
-    uint32_t polynomial_size, std::function<Torus(Torus)> f,
+    uint32_t polynomial_size, std::function<Torus(uint32_t)> f,
    bool gpu_memory_allocated) {

  Torus *h_lut =
@@ -987,7 +986,7 @@ void generate_device_accumulator_no_encoding(
  generate_lookup_table_no_encoding<Torus>(h_lut, glwe_dimension,
                                           polynomial_size, f);

-  *degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;
+  degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;

  cuda_memcpy_with_size_tracking_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
@@ -1739,9 +1738,12 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
      signs_array_in, 0, num_sign_blocks);
  if (num_sign_blocks > 2) {
    auto lut = diff_buffer->reduce_signs_lut;
-    lut->generate_and_broadcast_lut(lut->active_streams, {0},
-                                    {reduce_two_orderings_function}, true, true,
-                                    {diff_buffer->preallocated_h_lut1});
+    generate_device_accumulator_with_cpu_prealloc<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        reduce_two_orderings_function, true, diff_buffer->preallocated_h_lut1);
+    lut->broadcast_lut(lut->active_streams);

    while (num_sign_blocks > 2) {
      pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
@@ -1767,10 +1769,12 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
    };

    auto lut = diff_buffer->reduce_signs_lut;
-
-    lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
-                                    true, true,
-                                    {diff_buffer->preallocated_h_lut2});
+    generate_device_accumulator_with_cpu_prealloc<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
+        diff_buffer->preallocated_h_lut2);
+    lut->broadcast_lut(lut->active_streams);

    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
                       signs_a, num_sign_blocks, message_modulus);
@@ -1785,9 +1789,12 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
    };

    auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
-    lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
-                                    true, true,
-                                    {diff_buffer->preallocated_h_lut2});
+    generate_device_accumulator_with_cpu_prealloc<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
+        diff_buffer->preallocated_h_lut2);
+    lut->broadcast_lut(lut->active_streams);

    integer_radix_apply_univariate_lookup_table<Torus>(
        streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -141,10 +141,13 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
    };

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
+    generate_device_accumulator_with_cpu_prealloc<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
+        true, mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-    lut->generate_and_broadcast_lut(
-        active_streams, {0}, {scalar_last_leaf_lut_f}, true, true,
-        {mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut});
+    lut->broadcast_lut(active_streams);

    integer_radix_apply_univariate_lookup_table<Torus>(
        streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks, lut, 1);
@@ -231,10 +234,14 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
    };

    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
+    generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f, true,
+        mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-    lut->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {scalar_bivariate_last_leaf_lut_f}, true,
-        {mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut});
+    lut->broadcast_lut(active_streams);

    integer_radix_apply_bivariate_lookup_table<Torus>(
        streams, lwe_array_out, lwe_array_lsb_out, &lwe_array_msb_out, bsks,
@@ -261,10 +268,14 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
      int_radix_lut<Torus> *one_block_lut =
          new int_radix_lut<Torus>(streams, params, 1, 1, true, size);

+      generate_device_accumulator_with_cpu_prealloc<Torus>(
+          streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
+          one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, one_block_lut_f, true,
+          mem_ptr->preallocated_h_lut);
      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      one_block_lut->generate_and_broadcast_lut(active_streams, {0},
-                                                {one_block_lut_f}, true, true,
-                                                {mem_ptr->preallocated_h_lut});
+      one_block_lut->broadcast_lut(active_streams);

      integer_radix_apply_univariate_lookup_table<Torus>(
          streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
@@ -402,11 +413,14 @@ __host__ void integer_radix_signed_scalar_difference_check(
    };

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-
+    generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f, true,
+        mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-    lut->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {scalar_bivariate_last_leaf_lut_f}, true,
-        {mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut});
+    lut->broadcast_lut(active_streams);

    integer_radix_apply_bivariate_lookup_table<Torus>(
        streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks, lut,
@@ -501,9 +515,14 @@ __host__ void integer_radix_signed_scalar_difference_check(
    };

    auto signed_msb_lut = mem_ptr->signed_msb_lut;
-    auto msb_active_streams = msb_streams.active_gpu_subset(1, params.pbs_type);
-    signed_msb_lut->generate_and_broadcast_bivariate_lut(
-        msb_active_streams, {0}, {lut_f}, true, {mem_ptr->preallocated_h_lut});
+    generate_device_accumulator_bivariate_with_cpu_prealloc<Torus>(
+        msb_streams.stream(0), streams.gpu_index(0),
+        signed_msb_lut->get_lut(0, 0), signed_msb_lut->get_degree(0),
+        signed_msb_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        lut_f, true, mem_ptr->preallocated_h_lut);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
+    signed_msb_lut->broadcast_lut(active_streams);

    CudaRadixCiphertextFFI sign_block;
    as_radix_ciphertext_slice<Torus>(
@@ -542,10 +561,14 @@ __host__ void integer_radix_signed_scalar_difference_check(
      int_radix_lut<Torus> *one_block_lut =
          new int_radix_lut<Torus>(streams, params, 1, 1, true, size);

+      generate_device_accumulator_with_cpu_prealloc<Torus>(
+          streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
+          one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, one_block_lut_f, true,
+          mem_ptr->preallocated_h_lut);
      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      one_block_lut->generate_and_broadcast_lut(active_streams, {0},
-                                                {one_block_lut_f}, true, true,
-                                                {mem_ptr->preallocated_h_lut});
+      one_block_lut->broadcast_lut(active_streams);

      integer_radix_apply_univariate_lookup_table<Torus>(
          streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh
@@ -175,10 +175,6 @@ __host__ void host_aggregate_one_hot_vector(
    Torus *const *ksks) {

  int_radix_params params = mem_ptr->params;
-  if (params.message_modulus > 4 && params.carry_modulus > 4) {
-    PANIC("Cuda error: aggregate one hot vector is only implemented for 1_1 "
-          "and 2_2 params");
-  }
  uint32_t chunk_size = mem_ptr->chunk_size;
  uint32_t num_streams = mem_ptr->num_streams;

@@ -259,10 +255,7 @@ __host__ void host_aggregate_one_hot_vector(

  //
  // Aggregate partial results from all streams into the final aggregated vector
-  // num_streams has to be less than the max noise level otherwise we accumulate
-  // too much and the noise limit is exceeded
  //
-  CHECK_NOISE_LEVEL(num_streams, params.message_modulus, params.carry_modulus);
  for (uint32_t s = 1; s < num_streams; s++) {
    uint32_t start_idx = s * inputs_per_stream;
    if (start_idx >= num_input_ciphertexts)
--- a/backends/tfhe-cuda-backend/cuda/src/kreyvium/kreyvium.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/kreyvium/kreyvium.cu
@@ -1,45 +0,0 @@
-#include "../../include/kreyvium/kreyvium.h"
-#include "kreyvium.cuh"
-
-uint64_t scratch_cuda_kreyvium_64(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
-
-  return scratch_cuda_kreyvium_encrypt<uint64_t>(
-      CudaStreams(streams), (int_kreyvium_buffer<uint64_t> **)mem_ptr, params,
-      allocate_gpu_memory, num_inputs);
-}
-
-void cuda_kreyvium_generate_keystream_64(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
-    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
-    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks) {
-
-  auto buffer = (int_kreyvium_buffer<uint64_t> *)mem_ptr;
-
-  host_kreyvium_generate_keystream<uint64_t>(
-      CudaStreams(streams), keystream_output, key, iv, num_inputs, num_steps,
-      buffer, bsks, (uint64_t *const *)ksks);
-}
-
-void cleanup_cuda_kreyvium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
-
-  int_kreyvium_buffer<uint64_t> *mem_ptr =
-      (int_kreyvium_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release(CudaStreams(streams));
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
--- a/backends/tfhe-cuda-backend/cuda/src/kreyvium/kreyvium.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/kreyvium/kreyvium.cuh
@@ -1,358 +0,0 @@
-#ifndef KREYVIUM_CUH
-#define KREYVIUM_CUH
-
-#include "../../include/kreyvium/kreyvium_utilities.h"
-#include "../integer/integer.cuh"
-#include "../integer/radix_ciphertext.cuh"
-#include "../integer/scalar_addition.cuh"
-#include "../linearalgebra/addition.cuh"
-
-// Creates a view (slice) of specific bits in a register.
-// Used to access specific taps like a[65], k[127], etc.
-template <typename Torus>
-__host__ void slice_reg_batch_kreyvium(CudaRadixCiphertextFFI *slice,
-                                       const CudaRadixCiphertextFFI *reg,
-                                       uint32_t start_bit_idx,
-                                       uint32_t num_bits, uint32_t num_inputs) {
-  as_radix_ciphertext_slice<Torus>(slice, reg, start_bit_idx * num_inputs,
-                                   (start_bit_idx + num_bits) * num_inputs);
-}
-
-// Standard shift-and-insert for Kreyvium registers A, B, C.
-// Shifts the register and inserts new bits at the start.
-template <typename Torus>
-__host__ void shift_and_insert_batch_kreyvium(CudaStreams streams,
-                                              int_kreyvium_buffer<Torus> *mem,
-                                              CudaRadixCiphertextFFI *reg,
-                                              CudaRadixCiphertextFFI *new_bits,
-                                              uint32_t reg_size,
-                                              uint32_t num_inputs) {
-  constexpr uint32_t BATCH = KREYVIUM_BATCH_SIZE;
-  CudaRadixCiphertextFFI *temp = mem->state->shift_workspace;
-  uint32_t num_blocks_to_keep = (reg_size - BATCH) * num_inputs;
-
-  copy_radix_ciphertext_slice_async<Torus>(
-      streams.stream(0), streams.gpu_index(0), temp, 0, num_blocks_to_keep, reg,
-      BATCH * num_inputs, reg_size * num_inputs);
-
-  copy_radix_ciphertext_slice_async<Torus>(
-      streams.stream(0), streams.gpu_index(0), temp, num_blocks_to_keep,
-      reg_size * num_inputs, new_bits, 0, BATCH * num_inputs);
-
-  copy_radix_ciphertext_slice_async<Torus>(
-      streams.stream(0), streams.gpu_index(0), reg, 0, reg_size * num_inputs,
-      temp, 0, reg_size * num_inputs);
-}
-
-// Reverses the order of blocks in a ciphertext buffer.
-// Essential for aligning Key/IV bit ordering.
-template <typename Torus>
-void reverse_bitsliced_radix_inplace_kreyvium(CudaStreams streams,
-                                              int_kreyvium_buffer<Torus> *mem,
-                                              CudaRadixCiphertextFFI *radix,
-                                              uint32_t num_bits_in_reg) {
-  uint32_t N = mem->num_inputs;
-  CudaRadixCiphertextFFI *temp = mem->state->shift_workspace;
-
-  for (uint32_t i = 0; i < num_bits_in_reg; i++) {
-    uint32_t src_start = i * N;
-    uint32_t src_end = (i + 1) * N;
-    uint32_t dest_start = (num_bits_in_reg - 1 - i) * N;
-    uint32_t dest_end = (num_bits_in_reg - i) * N;
-
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), temp, dest_start, dest_end,
-        radix, src_start, src_end);
-  }
-
-  copy_radix_ciphertext_slice_async<Torus>(
-      streams.stream(0), streams.gpu_index(0), radix, 0, num_bits_in_reg * N,
-      temp, 0, num_bits_in_reg * N);
-}
-
-// Core Kreyvium step function: computes 64 steps in parallel.
-// Includes XORs, AND gates (via PBS), Key/IV rotation, and register updates.
-template <typename Torus>
-__host__ void
-kreyvium_compute_64_steps(CudaStreams streams, int_kreyvium_buffer<Torus> *mem,
-                          CudaRadixCiphertextFFI *output_dest,
-                          void *const *bsks, uint64_t *const *ksks) {
-
-  uint32_t N = mem->num_inputs;
-  constexpr uint32_t BATCH = KREYVIUM_BATCH_SIZE;
-  uint32_t batch_size_blocks = BATCH * N;
-  auto s = mem->state;
-  auto luts = mem->luts;
-
-  // Extract register taps for A (93-bit register)
-  CudaRadixCiphertextFFI a65, a92, a91, a90, a68;
-  slice_reg_batch_kreyvium<Torus>(&a65, s->a_reg, 27, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&a92, s->a_reg, 0, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&a91, s->a_reg, 1, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&a90, s->a_reg, 2, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&a68, s->a_reg, 24, BATCH, N);
-
-  // Extract register taps for B (84-bit register)
-  CudaRadixCiphertextFFI b68, b83, b82, b81, b77;
-  slice_reg_batch_kreyvium<Torus>(&b68, s->b_reg, 15, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&b83, s->b_reg, 0, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&b82, s->b_reg, 1, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&b81, s->b_reg, 2, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&b77, s->b_reg, 6, BATCH, N);
-
-  // Extract register taps for C (111-bit register)
-  CudaRadixCiphertextFFI c65, c110, c109, c108, c86;
-  slice_reg_batch_kreyvium<Torus>(&c65, s->c_reg, 45, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&c110, s->c_reg, 0, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&c109, s->c_reg, 1, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&c108, s->c_reg, 2, BATCH, N);
-  slice_reg_batch_kreyvium<Torus>(&c86, s->c_reg, 24, BATCH, N);
-
-  // Extract key and IV bits using virtual rotation offset
-  CudaRadixCiphertextFFI k127, iv127;
-  slice_reg_batch_kreyvium<Torus>(&k127, s->k_reg, s->k_offset, 64, N);
-  slice_reg_batch_kreyvium<Torus>(&iv127, s->iv_reg, s->iv_offset, 64, N);
-  s->k_offset = (s->k_offset + 64) % 128;
-  s->iv_offset = (s->iv_offset + 64) % 128;
-
-  // Compute linear feedback terms:
-  // temp_a = a65 + a92
-  // temp_b = b68 + b83
-  // temp_c = c65 + c110 + k127
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), s->temp_a, &a65,
-                       &a92, s->temp_a->num_radix_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), s->temp_b, &b68,
-                       &b83, s->temp_b->num_radix_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), s->temp_c, &c65,
-                       &c110, s->temp_c->num_radix_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), s->temp_c,
-                       s->temp_c, &k127, s->temp_c->num_radix_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-
-  // Pack AND gate inputs: (c109 & c108), (a91 & a90), (b82 & b81)
-  CudaRadixCiphertextFFI *lhs_ptrs[] = {&c109, &a91, &b82};
-  CudaRadixCiphertextFFI *rhs_ptrs[] = {&c108, &a90, &b81};
-
-  for (uint32_t i = 0; i < KREYVIUM_NUM_AND_GATES; i++) {
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), s->packed_and_lhs,
-        i * batch_size_blocks, (i + 1) * batch_size_blocks, lhs_ptrs[i], 0,
-        batch_size_blocks);
-
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), s->packed_and_rhs,
-        i * batch_size_blocks, (i + 1) * batch_size_blocks, rhs_ptrs[i], 0,
-        batch_size_blocks);
-  }
-
-  // Execute 3 AND gates in parallel via bivariate PBS
-  integer_radix_apply_bivariate_lookup_table<Torus>(
-      streams, s->packed_and_out, s->packed_and_lhs, s->packed_and_rhs, bsks,
-      ksks, luts->and_lut, KREYVIUM_NUM_AND_GATES * batch_size_blocks,
-      mem->params.message_modulus);
-
-  // Unpack AND results
-  CudaRadixCiphertextFFI and_c109_c108, and_a91_a90, and_b82_b81;
-  CudaRadixCiphertextFFI *and_out_ptrs[] = {&and_c109_c108, &and_a91_a90,
-                                            &and_b82_b81};
-
-  for (uint32_t i = 0; i < KREYVIUM_NUM_AND_GATES; i++) {
-    as_radix_ciphertext_slice<Torus>(and_out_ptrs[i], s->packed_and_out,
-                                     i * batch_size_blocks,
-                                     (i + 1) * batch_size_blocks);
-  }
-
-  // Create slices pointing directly into flush input buffer
-  // We utilize a loop here to slice the packed buffer into 4 distinct views
-  CudaRadixCiphertextFFI flush_new_a, flush_new_b, flush_new_c, flush_out;
-  CudaRadixCiphertextFFI *flush_in_slices[] = {&flush_new_a, &flush_new_b,
-                                               &flush_new_c, &flush_out};
-
-  for (uint32_t i = 0; i < KREYVIUM_NUM_FLUSH_PATHS; i++) {
-    as_radix_ciphertext_slice<Torus>(flush_in_slices[i], s->packed_flush_in,
-                                     i * batch_size_blocks,
-                                     (i + 1) * batch_size_blocks);
-  }
-
-  // new_a = (c109 & c108) + a68 + temp_c
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &flush_new_a,
-                       &and_c109_c108, &a68, batch_size_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &flush_new_a,
-                       &flush_new_a, s->temp_c, batch_size_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-
-  // new_b = (a91 & a90) + b77 + temp_a + iv127
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &flush_new_b,
-                       &and_a91_a90, &b77, batch_size_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &flush_new_b,
-                       &flush_new_b, s->temp_a, batch_size_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &flush_new_b,
-                       &flush_new_b, &iv127, batch_size_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-
-  // new_c = (b82 & b81) + c86 + temp_b
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &flush_new_c,
-                       &and_b82_b81, &c86, batch_size_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &flush_new_c,
-                       &flush_new_c, s->temp_b, batch_size_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-
-  // out = temp_a + temp_b + temp_c
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &flush_out,
-                       s->temp_a, s->temp_b, batch_size_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), &flush_out,
-                       &flush_out, s->temp_c, batch_size_blocks,
-                       mem->params.message_modulus, mem->params.carry_modulus);
-
-  // Apply flush PBS to extract message bits and reset noise
-  integer_radix_apply_univariate_lookup_table<Torus>(
-      streams, s->packed_flush_out, s->packed_flush_in, bsks, ksks,
-      luts->flush_lut, KREYVIUM_NUM_FLUSH_PATHS * batch_size_blocks);
-
-  // Unpack flushed results
-  CudaRadixCiphertextFFI flushed_new_a, flushed_new_b, flushed_new_c,
-      flushed_out;
-  CudaRadixCiphertextFFI *flush_out_slices[] = {&flushed_new_a, &flushed_new_b,
-                                                &flushed_new_c, &flushed_out};
-
-  for (uint32_t i = 0; i < KREYVIUM_NUM_FLUSH_PATHS; i++) {
-    as_radix_ciphertext_slice<Torus>(flush_out_slices[i], s->packed_flush_out,
-                                     i * batch_size_blocks,
-                                     (i + 1) * batch_size_blocks);
-  }
-
-  // Update registers: shift and insert new 64 bits
-  shift_and_insert_batch_kreyvium(streams, mem, s->a_reg, &flushed_new_a, 93,
-                                  N);
-  shift_and_insert_batch_kreyvium(streams, mem, s->b_reg, &flushed_new_b, 84,
-                                  N);
-  shift_and_insert_batch_kreyvium(streams, mem, s->c_reg, &flushed_new_c, 111,
-                                  N);
-
-  // Copy output keystream if destination provided
-  if (output_dest != nullptr) {
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), output_dest, 0,
-        batch_size_blocks, &flushed_out, 0, batch_size_blocks);
-  }
-}
-
-// Initialization phase: Loads Key/IV, distributes bits to registers A, B, C,
-// and runs the warm-up loop.
-template <typename Torus>
-__host__ void kreyvium_init(CudaStreams streams,
-                            int_kreyvium_buffer<Torus> *mem,
-                            CudaRadixCiphertextFFI const *key_bitsliced,
-                            CudaRadixCiphertextFFI const *iv_bitsliced,
-                            void *const *bsks, uint64_t *const *ksks) {
-  uint32_t N = mem->num_inputs;
-  auto s = mem->state;
-  s->k_offset = 0;
-  s->iv_offset = 0;
-
-  CudaRadixCiphertextFFI src_key_slice;
-  slice_reg_batch_kreyvium<Torus>(&src_key_slice, key_bitsliced, 0, 128, N);
-
-  CudaRadixCiphertextFFI dest_k_reg_slice;
-  slice_reg_batch_kreyvium<Torus>(&dest_k_reg_slice, s->k_reg, 0, 128, N);
-  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                     &dest_k_reg_slice, &src_key_slice);
-
-  CudaRadixCiphertextFFI k_source_for_a;
-  slice_reg_batch_kreyvium<Torus>(&k_source_for_a, s->k_reg, 35, 93, N);
-
-  CudaRadixCiphertextFFI dest_a_slice;
-  slice_reg_batch_kreyvium<Torus>(&dest_a_slice, s->a_reg, 0, 93, N);
-  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                     &dest_a_slice, &k_source_for_a);
-
-  reverse_bitsliced_radix_inplace_kreyvium<Torus>(streams, mem, s->k_reg, 128);
-
-  CudaRadixCiphertextFFI src_iv_slice;
-  slice_reg_batch_kreyvium<Torus>(&src_iv_slice, iv_bitsliced, 0, 128, N);
-
-  CudaRadixCiphertextFFI dest_iv_reg_slice;
-  slice_reg_batch_kreyvium<Torus>(&dest_iv_reg_slice, s->iv_reg, 0, 128, N);
-  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                     &dest_iv_reg_slice, &src_iv_slice);
-
-  CudaRadixCiphertextFFI iv_source_for_b;
-  slice_reg_batch_kreyvium<Torus>(&iv_source_for_b, s->iv_reg, 44, 84, N);
-
-  CudaRadixCiphertextFFI dest_b_slice;
-  slice_reg_batch_kreyvium<Torus>(&dest_b_slice, s->b_reg, 0, 84, N);
-  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                     &dest_b_slice, &iv_source_for_b);
-
-  CudaRadixCiphertextFFI iv_source_for_c;
-  slice_reg_batch_kreyvium<Torus>(&iv_source_for_c, s->iv_reg, 0, 44, N);
-
-  CudaRadixCiphertextFFI dest_c_iv_part;
-  slice_reg_batch_kreyvium<Torus>(&dest_c_iv_part, s->c_reg, 67, 44, N);
-  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                     &dest_c_iv_part, &iv_source_for_c);
-
-  reverse_bitsliced_radix_inplace_kreyvium<Torus>(streams, mem, s->iv_reg, 128);
-
-  CudaRadixCiphertextFFI dest_c_ones;
-  slice_reg_batch_kreyvium<Torus>(&dest_c_ones, s->c_reg, 1, 66, N);
-  host_add_scalar_one_inplace<Torus>(streams, &dest_c_ones,
-                                     mem->params.message_modulus,
-                                     mem->params.carry_modulus);
-
-  integer_radix_apply_univariate_lookup_table<Torus>(
-      streams, &dest_c_ones, &dest_c_ones, bsks, ksks, mem->luts->flush_lut,
-      dest_c_ones.num_radix_blocks);
-
-  for (int i = 0; i < 18; i++) {
-    kreyvium_compute_64_steps(streams, mem, nullptr, bsks, ksks);
-  }
-}
-
-// Main entry point: Generates keystream in batches of 64 steps.
-template <typename Torus>
-__host__ void host_kreyvium_generate_keystream(
-    CudaStreams streams, CudaRadixCiphertextFFI *keystream_output,
-    CudaRadixCiphertextFFI const *key_bitsliced,
-    CudaRadixCiphertextFFI const *iv_bitsliced, uint32_t num_inputs,
-    uint32_t num_steps, int_kreyvium_buffer<Torus> *mem, void *const *bsks,
-    uint64_t *const *ksks) {
-
-  PANIC_IF_FALSE(
-      num_steps % KREYVIUM_BATCH_SIZE == 0,
-      "Kreyvium Error: num_steps must be a multiple of the batch size (64).\n");
-
-  kreyvium_init(streams, mem, key_bitsliced, iv_bitsliced, bsks, ksks);
-
-  uint32_t num_batches = num_steps / KREYVIUM_BATCH_SIZE;
-  for (uint32_t i = 0; i < num_batches; i++) {
-    CudaRadixCiphertextFFI batch_out_slice;
-    slice_reg_batch_kreyvium<Torus>(&batch_out_slice, keystream_output,
-                                    i * KREYVIUM_BATCH_SIZE,
-                                    KREYVIUM_BATCH_SIZE, num_inputs);
-    kreyvium_compute_64_steps(streams, mem, &batch_out_slice, bsks, ksks);
-  }
-}
-
-template <typename Torus>
-uint64_t scratch_cuda_kreyvium_encrypt(CudaStreams streams,
-                                       int_kreyvium_buffer<Torus> **mem_ptr,
-                                       int_radix_params params,
-                                       bool allocate_gpu_memory,
-                                       uint32_t num_inputs) {
-
-  uint64_t size_tracker = 0;
-  *mem_ptr = new int_kreyvium_buffer<Torus>(
-      streams, params, allocate_gpu_memory, num_inputs, size_tracker);
-  return size_tracker;
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -211,8 +211,6 @@ __global__ void device_programmable_bootstrap_amortized(
  // For the mask it's more complicated
  sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
-
-  // No need to sync here, it is already synchronized after add_to_torus
  sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
 }
@@ -305,9 +303,7 @@ __host__ void host_programmable_bootstrap_amortized(
    int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count) {
-  PANIC_IF_FALSE(sizeof(Torus) == 8,
-                 "Error: Programmable bootstrap amortized only supports 64-bit "
-                 "Torus type.");
+
  uint64_t SM_FULL =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -173,7 +173,6 @@ __global__ void device_programmable_bootstrap_cg(
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
-      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -185,8 +184,7 @@ __global__ void device_programmable_bootstrap_cg(
              &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];
-          // No need to sync, it is already synchronized before the first
-          // sample_extract_body call
+
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -249,9 +247,7 @@ __host__ void host_programmable_bootstrap_cg(
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    uint32_t num_many_lut, uint32_t lut_stride) {
-  PANIC_IF_FALSE(sizeof(Torus) == 8,
-                 "Error: Programmable bootstrap cg only supports 64-bit "
-                 "Torus type.");
+
  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
  uint64_t full_sm =
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -151,8 +151,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }

      } else if (blockIdx.y == glwe_dimension) {
-        __syncthreads();
+
        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+
        if (num_many_lut > 1) {
          for (int i = 1; i < num_many_lut; i++) {

@@ -164,8 +165,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];

-            // No need to sync, it is already synchronized before the first
-            // sample_extract_body call
            sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                               accumulator, 0, i * lut_stride);
          }
@@ -303,10 +302,7 @@ __host__ void execute_cg_external_product_loop(
    uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
    uint32_t lut_stride) {
  cuda_set_device(gpu_index);
-  PANIC_IF_FALSE(
-      sizeof(Torus) == 8,
-      "Error: Programmable bootstrap multi-bit cg only supports 64-bit "
-      "Torus type.");
+
  uint64_t full_sm =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -239,7 +239,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
-      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -252,8 +251,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

-          // No need to sync, it is already synchronized before the first
-          // sample_extract_body call
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -412,9 +409,7 @@ __host__ void execute_step_one(
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-  PANIC_IF_FALSE(sizeof(Torus) == 8,
-                 "Error: Programmable bootstrap step one only supports 64-bit "
-                 "Torus type.");
+
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
@@ -456,9 +451,7 @@ __host__ void execute_step_two(
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
    uint32_t num_many_lut, uint32_t lut_stride) {
-  PANIC_IF_FALSE(sizeof(Torus) == 8,
-                 "Error: Programmable bootstrap step two only supports 64-bit "
-                 "Torus type.");
+
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
@@ -240,13 +240,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
    } else if (blockIdx.y == glwe_dimension) {
-      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
    }
  } else {
-    // We don't sync here because we use same indexes to read from `accumulator`
-    // as it was used in `add_to_torus_128` to write inside it Persist the
-    // updated accumulator
+    // Persist the updated accumulator
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      global_slice[tid] = accumulator[tid];
@@ -398,7 +395,6 @@ __global__ void device_programmable_bootstrap_cg_128(
                                               accumulator);

    } else if (blockIdx.y == glwe_dimension) {
-      __syncthreads();
      sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
                                               0);
    }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -461,7 +461,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
-      // No need to sync here, it is already synchronized after add_to_torus
      sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -474,7 +473,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

-          // No need to sync here, it is already synchronized after add_to_torus
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             global_slice, 0, i * lut_stride);
        }
@@ -665,9 +663,7 @@ __host__ void execute_compute_keybundle(
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
  cuda_set_device(gpu_index);
-  PANIC_IF_FALSE(sizeof(Torus) == 8,
-                 "Error: PBS keybundle only supports 64-bit "
-                 "Torus type.");
+
  auto lwe_chunk_size = buffer->lwe_chunk_size;
  uint64_t chunk_size = std::min(
      lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
@@ -739,10 +735,7 @@ __host__ void execute_step_one(
    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
  cuda_set_device(gpu_index);
-  PANIC_IF_FALSE(
-      sizeof(Torus) == 8,
-      "Error: Programmable bootstrap multi-bit step one only supports 64-bit "
-      "Torus type.");
+
  uint64_t full_sm_accumulate_step_one =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
          polynomial_size);
@@ -796,10 +789,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
                 uint32_t level_count, uint32_t j, uint32_t num_many_lut,
                 uint32_t lut_stride) {
  cuda_set_device(gpu_index);
-  PANIC_IF_FALSE(
-      sizeof(Torus) == 8,
-      "Error: Programmable bootstrap multi-bit step two only supports 64-bit "
-      "Torus type.");
+
  uint32_t lwe_chunk_size = (uint32_t)(buffer->lwe_chunk_size);
  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
@@ -333,7 +333,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
-      __syncthreads();
      sample_extract_body<__uint128_t, params>(block_lwe_array_out,
                                               global_slice, 0);
      if (num_many_lut > 1) {
@@ -347,8 +346,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

-          // No need to sync, it is already synchronized before the first
-          // sample_extract_body call
          sample_extract_body<__uint128_t, params>(
              next_block_lwe_array_out, global_slice, 0, i * lut_stride);
        }
@@ -508,9 +505,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }

      } else if (blockIdx.y == glwe_dimension) {
-        __syncthreads();
+
        sample_extract_body<__uint128_t, params>(block_lwe_array_out,
                                                 accumulator, 0);
+
        if (num_many_lut > 1) {
          for (int i = 1; i < num_many_lut; i++) {

@@ -521,8 +519,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];
-            // No need to sync, it is already synchronized before the first
-            // sample_extract_body call
+
            sample_extract_body<__uint128_t, params>(
                next_block_lwe_array_out, accumulator, 0, i * lut_stride);
          }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -179,7 +179,6 @@ __global__ void device_programmable_bootstrap_tbc(
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
-      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);

      if (num_many_lut > 1) {
@@ -192,8 +191,7 @@ __global__ void device_programmable_bootstrap_tbc(
              &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];
-          // No need to sync, it is already synchronized before the first
-          // sample_extract_body call
+
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -362,7 +360,6 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
-      // No need to sync here, it is already synchronized after add_to_torus
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);

      if (num_many_lut > 1) {
@@ -376,7 +373,6 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

-          // No need to sync here, it is already synchronized after add_to_torus
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -462,9 +458,6 @@ __host__ void host_programmable_bootstrap_tbc(
    uint32_t num_many_lut, uint32_t lut_stride) {
  cuda_set_device(gpu_index);

-  PANIC_IF_FALSE(sizeof(Torus) == 8,
-                 "Error: Programmable bootstrap tbc only supports 64-bit "
-                 "Torus type.");
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  auto supports_dsm =
      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -156,7 +156,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
          }
        }
      } else if (blockIdx.y == glwe_dimension) {
-        __syncthreads();
        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
        if (num_many_lut > 1) {
          for (int i = 1; i < num_many_lut; i++) {
@@ -168,8 +167,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];
-            // No need to sync, it is already synchronized before the first
-            // sample_extract_body call
+
            sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                               accumulator, 0, i * lut_stride);
          }
@@ -358,7 +356,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
-      // No need to sync here, it is already synchronized after add_to_torus
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -370,8 +367,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
              &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];
-          // No need to sync here, it is already synchronized after
-          // add_to_torus
+
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -527,11 +523,6 @@ __host__ void execute_tbc_external_product_loop(
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
    uint32_t lut_stride) {
-
-  PANIC_IF_FALSE(
-      sizeof(Torus) == 8,
-      "Error: Programmable bootstrap multi-bit tbc only supports 64-bit "
-      "Torus type.");
  cuda_set_device(gpu_index);

  auto lwe_chunk_size = buffer->lwe_chunk_size;
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -339,10 +339,8 @@ template <typename Torus, class params>
 __device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,
                                    uint32_t glwe_dimension, uint32_t nth = 0) {
  // Set first coefficient of the glwe as the body of the LWE sample
-  if (threadIdx.x == 0) {
-    lwe_array_out[glwe_dimension * params::degree] =
-        glwe[glwe_dimension * params::degree + nth];
-  }
+  lwe_array_out[glwe_dimension * params::degree] =
+      glwe[glwe_dimension * params::degree + nth];
 }

 // Extracts the mask from the nth-LWE in a GLWE.
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
@@ -8,6 +8,38 @@ bool p2p_enabled = false;
 const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS = 12;
 const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS = 68;

+// Enable bidirectional p2p access between all available GPUs and device_0_id
+int32_t cuda_setup_multi_gpu(int device_0_id) {
+  int num_gpus = cuda_get_number_of_gpus();
+  if (num_gpus == 0)
+    PANIC("GPU error: the number of GPUs should be > 0.")
+  int num_used_gpus = 1;
+  if (num_gpus > 1) {
+    m.lock();
+    if (!p2p_enabled) {
+      p2p_enabled = true;
+      omp_set_nested(1);
+      int has_peer_access_to_device_0;
+      for (int i = 1; i < num_gpus; i++) {
+        check_cuda_error(cudaDeviceCanAccessPeer(&has_peer_access_to_device_0,
+                                                 i, device_0_id));
+        if (has_peer_access_to_device_0) {
+          cuda_set_device(i);
+          check_cuda_error(cudaDeviceEnablePeerAccess(device_0_id, 0));
+          cuda_set_device(device_0_id);
+          check_cuda_error(cudaDeviceEnablePeerAccess(i, 0));
+        }
+        num_used_gpus += 1;
+      }
+    } else {
+      for (int i = 1; i < num_gpus; i++)
+        num_used_gpus += 1;
+    }
+    m.unlock();
+  }
+  return (int32_t)(num_used_gpus);
+}
+
 uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                              PBS_TYPE pbs_type) {
  int threshold = (pbs_type == MULTI_BIT)
--- a/Show More
+++ b/Show More