Fix Makefile

Add cost for 8xL40
Add decomp_sns_comp bench to summary
2026-04-28 03:01:21 -04:00 · 2026-02-13 09:49:21 +01:00 · 2026-02-13 09:01:10 +01:00 · 2026-02-12 17:44:24 +01:00 · 2026-02-12 16:34:03 +01:00 · 2026-02-12 15:05:00 +01:00
284 changed files with 15194 additions and 5777 deletions
--- a/.github/runs-on.yml
+++ b/.github/runs-on.yml
@@ -0,0 +1,15 @@
+runners:  # Named runner profiles; each key below (cpu-big, cpu-small) is one profile
+  cpu-big:
+    family: m6i.32xlarge  # NOTE(review): looks like an AWS EC2 instance type - confirm
+    image: cpu-tests-eu-west-3  # Matches the entry of the same name under `images:`
+    volume: 200gb
+    spot: false  # NOTE(review): presumably opts out of spot capacity - confirm
+  cpu-small:  # Same settings as cpu-big except for the smaller `family`
+    family: m6i.4xlarge
+    image: cpu-tests-eu-west-3
+    volume: 200gb
+    spot: false
+
+images:  # Image definitions referenced by the `image:` keys above
+  cpu-tests-eu-west-3:
+    ami: "ami-0a786ffdb1411fac4"  # Ubuntu 24.04
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -34,6 +34,9 @@ permissions:
 jobs:
  setup-instance:
    name: aws_tfhe_backward_compat_tests/setup-instance
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name != 'push'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -41,7 +44,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -66,7 +69,7 @@ jobs:
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'true' # Needed to pull lfs data
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -141,7 +144,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -63,7 +63,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -171,7 +171,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -299,7 +299,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -50,7 +50,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -86,7 +86,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    timeout-minutes: 480 # 8 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
    timeout-minutes: 1440
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -100,7 +100,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -51,7 +51,7 @@ jobs:
        steps.changed-files.outputs.integer_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -172,7 +172,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -72,7 +72,7 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -155,7 +155,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -279,7 +279,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -39,7 +39,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -147,7 +147,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu.yml
+++ b/.github/workflows/benchmark_cpu.yml
@@ -16,10 +16,12 @@ on:
          - integer_zk
          - shortint
          - shortint_oprf
-          - hlapi
+          - hlapi_unsigned
+          - hlapi_signed
          - hlapi_erc20
          - hlapi_dex
          - hlapi_noise_squash
+          - hlapi_kvstore
          - tfhe_zk_pok
          - boolean
          - pbs
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
        params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -229,7 +229,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_weekly.yml
+++ b/.github/workflows/benchmark_cpu_weekly.yml
@@ -24,6 +24,7 @@ permissions: {}
 jobs:
  prepare-inputs:
    name: benchmark_cpu_weekly/prepare-inputs
+    if: github.repository == 'zama-ai/tfhe-rs'
    runs-on: ubuntu-latest
    outputs:
      is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
@@ -72,8 +73,7 @@ jobs:

  run-benchmarks-integer:
    name: benchmark_cpu_weekly/run-benchmarks-integer
-    if: github.repository == 'zama-ai/tfhe-rs' 
-      && (needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -92,8 +92,7 @@ jobs:

  run-benchmarks-integer-zk-pke:
    name: benchmark_cpu_weekly/run-benchmarks-integer-zk-pke
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -111,8 +110,7 @@ jobs:

  run-benchmarks-hlapi-erc20:
    name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -130,8 +128,7 @@ jobs:

  run-benchmarks-hlapi-dex:
    name: benchmark_cpu_weekly/run-benchmarks-hlapi-dex
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -149,8 +146,7 @@ jobs:

  run-benchmarks-core-crypto:
    name: benchmark_cpu_weekly/run-benchmarks-core-crypto
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -167,8 +163,7 @@ jobs:

  run-benchmarks-shortint:
    name: benchmark_cpu_weekly/run-benchmarks-shortint
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && (needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -186,8 +181,7 @@ jobs:

  run-benchmarks-boolean:
    name: benchmark_cpu_weekly/run-benchmarks-boolean
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
@@ -206,8 +200,7 @@ jobs:

  run-benchmarks-tfhe-zk-pok:
    name: benchmark_cpu_weekly/run-benchmarks-tfhe-zk-pok
-    if: github.repository == 'zama-ai/tfhe-rs'
-      && needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_cpu_common.yml
    with:
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -105,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_documentation.yml
+++ b/.github/workflows/benchmark_documentation.yml
@@ -25,10 +25,6 @@ on:
        description: "Generate SVG tables"
        type: boolean
        default: true
-      open-pr:
-        description: "Open a PR with the benchmark results"
-        type: boolean
-        default: false

 permissions: {}

@@ -166,54 +162,3 @@ jobs:
      DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
      DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
      DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
-
-  open-pr:
-    name: benchmark-documentation/open-pr
-    needs: [ generate-svgs-with-benchmarks-run, generate-svgs-without-benchmarks-run ]
-    if: ${{ always() && inputs.open-pr &&
-      (needs.generate-svgs-with-benchmarks-run.result == 'success' || needs.generate-svgs-without-benchmarks-run.result == 'success') }}
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write # Needed to create a commit
-      pull-requests: write # Needed to open a pull-request
-    env:
-      PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
-        with:
-          persist-credentials: 'false'
-
-      - name: Download SVG tables
-        uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
-        with:
-          path: svg_tables
-          merge-multiple: 'true'
-
-      # Perform best effort to copy SVG tables. If the copy fails or files don't exist, the PR will still be created.
-      - name: Copy SVG tables to documentation location
-        run: |
-          cp -f svg_tables/*integer-benchmark*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
-          cp -f svg_tables/*pbs-benchmark-tuniform*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
-          cp -f svg_tables/cpu-gpu-hpu-integer-benchmark-fheuint64-tuniform-2m128-ciphertext.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
-
-      - name: Get current date
-        id: get-date
-        run: |
-          echo "date=$(date '+%g_%m_%d_%Hh%Mm%Ss')" >> "${GITHUB_OUTPUT}"
-
-      - name: Create pull-request
-        uses: peter-evans/create-pull-request@98357b18bf14b5342f975ff684046ec3b2a07725 # v8.0.0
-        with:
-          sign-commits: true # Commit will be signed by github-actions bot
-          add-paths: ${{ env.PATH_TO_DOC_ASSETS }}/*.svg
-          branch: gh-bot/docs/update-svg-tables-${{ steps.get-date.outputs.date }}
-          commit-message: |
-            chore(docs): update benchmark results for all backends
-
-            Automated documentation update from tfhe-rs CI pipeline.
-          title: |
-            [CI] chore(docs): update benchmark results for all backends
-          body: |
-            Documentation update triggered by GitHub workflow.
-          labels: documentation
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -40,7 +40,7 @@ jobs:
    timeout-minutes: 1440 # 24 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -123,7 +123,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -134,7 +134,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -175,7 +175,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -287,7 +287,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -50,6 +50,8 @@ env:
 jobs:
  parse-inputs:
    name: benchmark_gpu_coprocessor/parse-inputs
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      contents: 'read'
@@ -92,7 +94,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +132,7 @@ jobs:
          git lfs install

      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          path: tfhe-rs
          persist-credentials: false
@@ -141,7 +143,7 @@ jobs:
          ls

      - name: Checkout fhevm
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          repository: zama-ai/fhevm
          persist-credentials: 'false'
@@ -192,7 +194,7 @@ jobs:
          cargo install sqlx-cli

      - name: Install foundry
-        uses: foundry-rs/foundry-toolchain@8b0419c685ef46cb79ec93fbdc131174afceb730
+        uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10

      - name: Cache cargo
        uses: actions/cache@8b402f58fbc84540c8b491a91e594a4576fec3d7 # v5.0.2
@@ -299,7 +301,7 @@ jobs:
          path: fhevm/$${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -324,7 +326,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_weekly.yml
+++ b/.github/workflows/benchmark_gpu_weekly.yml
@@ -25,6 +25,7 @@ permissions: {}
 jobs:
  prepare-inputs:
    name: benchmark_cpu_weekly/prepare-inputs
+    if: github.repository == 'zama-ai/tfhe-rs'
    runs-on: ubuntu-latest
    outputs:
      is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
@@ -49,8 +50,7 @@ jobs:

  run-benchmarks-8-h100-sxm5-integer:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -72,8 +72,7 @@ jobs:

  run-benchmarks-8-h100-sxm5-integer-compression:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-compression
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -95,8 +94,7 @@ jobs:

  run-benchmarks-8-h100-sxm5-integer-zk-aes:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-zk-aes
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -118,8 +116,7 @@ jobs:

  run-benchmarks-8-h100-sxm5-noise-squash:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-noise-squash
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -141,8 +138,7 @@ jobs:

  run-benchmarks-1-h100-core-crypto:
    name: benchmark_gpu_weekly/run-benchmarks-1-h100-core-crypto (1xH100)
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -166,8 +162,7 @@ jobs:

  run-benchmarks-1-h100-erc20:
    name: benchmark_gpu_weekly/run-benchmarks-1-h100-erc20
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -187,8 +182,7 @@ jobs:

  run-benchmarks-2-h100-erc20:
    name: benchmark_gpu_weekly/run-benchmarks-2-h100-erc20
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -208,8 +202,7 @@ jobs:

  run-benchmarks-8-h100-erc20:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-erc20
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -233,8 +226,7 @@ jobs:

  run-benchmarks-1-h100-dex:
    name: benchmark_gpu_weekly/run-benchmarks-1-h100-dex
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -254,8 +246,7 @@ jobs:

  run-benchmarks-2-h100-dex:
    name: benchmark_gpu_weekly/run-benchmarks-2-h100-dex
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -275,8 +266,7 @@ jobs:

  run-benchmarks-8-h100-dex:
    name: benchmark_gpu_weekly/run-benchmarks-8-h100-dex
-    if: github.repository == 'zama-ai/tfhe-rs' &&
-      needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
+    if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
    needs: prepare-inputs
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
--- a/.github/workflows/benchmark_hpu.yml
+++ b/.github/workflows/benchmark_hpu.yml
@@ -12,7 +12,8 @@ on:
        default: integer
        options:
          - integer
-          - hlapi
+          - hlapi_unsigned
+          - hlapi_signed
          - hlapi_erc20
      op_flavor:
        description: "Operations set to run"
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -126,7 +126,7 @@ jobs:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -191,7 +191,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -50,7 +50,7 @@ jobs:
      pull-requests: write # Needed to write a comment in a pull-request
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
        command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0  # Needed to get commit hash
          persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -305,13 +305,13 @@ jobs:
      REF_NAME: ${{ github.head_ref || github.ref_name }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Install recent Python
-        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.12'
          pip-install: -r ci/data_extractor/requirements.txt -r ci/perf_regression/requirements.txt
@@ -383,7 +383,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_summary.yml
+++ b/.github/workflows/benchmark_summary.yml
@@ -0,0 +1,126 @@
+# Run all benchmarks displayed in the internal documentation.
+name: benchmark_summary
+
+run-name: Benchmark Summary
+
+on:  # manual trigger only
+  workflow_dispatch:
+    inputs:
+      run-cpu-benchmarks:  # gates the run-benchmarks-cpu job
+        description: "Run CPU benchmarks"
+        type: boolean
+        default: true
+      run-gpu-benchmarks:  # gates parse-gpu-inputs and run-benchmarks-gpu
+        description: "Run GPU benchmarks"
+        type: boolean
+        default: true
+      gpu-profile:  # "<profile> (<hardware name>)", split by the parse-gpu-inputs job
+        description: "GPU Instance type"
+        required: true
+        default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "4-l40 (n3-L40x4)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
+          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-h100-sxm5 (n3-H100-SXM5x8)"
+      run-hpu-benchmarks:  # NOTE: no effect yet, the only job reading it (run-benchmarks-hpu) is commented out
+        description: "Run HPU benchmarks"
+        type: boolean
+        default: true
+
+
+permissions: {}
+
+# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
+
+jobs:
+  parse-gpu-inputs:  # splits the gpu-profile choice into the two values run-benchmarks-gpu consumes
+    name: benchmark_summary/parse-gpu-inputs
+    if: inputs.run-gpu-benchmarks  # skipped when GPU benchmarks are not requested
+    runs-on: ubuntu-latest
+    outputs:
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
+    env:
+      INPUTS_PROFILE: ${{ inputs.gpu-profile }}  # e.g. "l40 (n3-L40x1)"
+    steps:
+      - name: Parse profile  # keeps the text before " (...)", e.g. "l40"
+        id: parse_profile
+        run: |
+          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
+          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"
+
+      - name: Parse hardware name  # keeps the text inside the parentheses, e.g. "n3-L40x1"
+        id: parse_hardware_name
+        run: |
+          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
+          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
+
+  run-benchmarks-cpu:  # CPU part of the summary, delegated to the shared CPU benchmark workflow
+    name: benchmark_summary/run-benchmarks-cpu
+    uses: ./.github/workflows/benchmark_cpu_common.yml
+    if: inputs.run-cpu-benchmarks  # skipped when the "Run CPU benchmarks" box is unticked
+    with:
+      command: summary
+      bench_type: both
+    secrets:  # the called workflow only receives the secrets listed here
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-gpu:  # GPU part of the summary, delegated to the shared GPU benchmark workflow
+    name: benchmark_summary/run-benchmarks-gpu
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    if: inputs.run-gpu-benchmarks  # skipped when the "Run GPU benchmarks" box is unticked
+    needs: parse-gpu-inputs  # provides the profile / hardware_name split from the gpu-profile input
+    with:
+      profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
+      command: summary
+      bench_type: both
+      params_type: classical + multi_bit
+    secrets:  # the called workflow only receives the secrets listed here
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+# TODO: add a make recipe for HPU benchmarks, then enable the job below
+#  run-benchmarks-hpu:
+#    name: benchmark_summary/run-benchmarks-hpu
+#    uses: ./.github/workflows/benchmark_hpu_common.yml
+#    if: inputs.run-hpu-benchmarks
+#    with:
+#      command: summary
+#      bench_type: both
+#      v80_pcie_dev: 24
+#      v80_serial_number: XFL12NWY3ZKG
+#    secrets:
+#      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+#      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+#      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+#      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+#      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+#      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+#      SLAB_URL: ${{ secrets.SLAB_URL }}
+#      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+#      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -31,13 +31,16 @@ permissions: {}
 jobs:
  setup-instance:
    name: benchmark_tfhe_fft/setup-instance
+    if:
+      (github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +58,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -102,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -134,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -31,13 +31,16 @@ permissions: {}
 jobs:
  setup-instance:
    name: benchmark_tfhe_ntt/setup-instance
+    if:
+      (github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +58,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -102,7 +105,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -134,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -31,15 +31,14 @@ jobs:
    name: benchmark_wasm_client/should-run
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
+      (github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs')
    permissions:
      pull-requests: read  # Needed to check for file change
    outputs:
      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -71,7 +70,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,7 +90,7 @@ jobs:
        browser: [ chrome, firefox ]
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -181,7 +180,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: zama-ai/slab
          path: slab
@@ -213,7 +212,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_audit.yml
+++ b/.github/workflows/cargo_audit.yml
@@ -24,9 +24,11 @@ permissions: {}
 jobs:
  audit:
    name: cargo_audit/audit
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -24,7 +24,7 @@ jobs:
    outputs:
      matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -140,7 +140,7 @@ jobs:
      result: ${{ steps.set_builds_result.outputs.result }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -242,7 +242,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_tfhe_fft.yml
+++ b/.github/workflows/cargo_build_tfhe_fft.yml
@@ -26,7 +26,7 @@ jobs:
      fail-fast: false

    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -24,7 +24,7 @@ jobs:
        os: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -2,6 +2,7 @@
 name: cargo_test_fft

 on:
+  workflow_dispatch:
  pull_request:
  push:
    branches:
@@ -22,6 +23,8 @@ permissions:
 jobs:
  should-run:
    name: cargo_test_fft/should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -29,7 +32,7 @@ jobs:
      fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -56,7 +59,7 @@ jobs:
        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -92,7 +95,7 @@ jobs:
    if: needs.should-run.outputs.fft_test == 'true'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -2,6 +2,7 @@
 name: cargo_test_ntt

 on:
+  workflow_dispatch:
  pull_request:
  push:
    branches:
@@ -24,6 +25,8 @@ permissions:
 jobs:
  should-run:
    name: cargo_test_ntt/should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -31,7 +34,7 @@ jobs:
      ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -60,7 +63,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,7 +90,7 @@ jobs:
        os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
      fail-fast: false
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -143,7 +146,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -20,7 +20,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -43,7 +43,7 @@ jobs:
          echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"

      - name: Check workflows security
-        uses: zizmorcore/zizmor-action@e639db99335bc9038abc0e066dfcd72e23d26fb4 # v0.3.0
+        uses: zizmorcore/zizmor-action@135698455da5c3b3e55f73f4419e481ab68cdd95 # v0.4.1
        with:
          advanced-security: 'false' # Print results directly in logs
          persona: pedantic
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -31,7 +31,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
    timeout-minutes: 5760 # 4 days
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -37,7 +37,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/generate_svg_common.yml
+++ b/.github/workflows/generate_svg_common.yml
@@ -43,7 +43,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'

--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -19,8 +19,8 @@ on:
  pull_request:
    types: [ labeled ]
  schedule:
-    # Nightly tests @ 1AM after each work day
-    - cron: "0 1 * * MON-FRI"
+    # Every other day at 1AM (odd days of the month, so the 31st and the 1st run back to back)
+    - cron: "0 1 */2 * *"

 permissions:
  contents: read
@@ -37,11 +37,11 @@ jobs:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: true
    runs-on: ["self-hosted", "4090-desktop"]
-    timeout-minutes: 1440 # 24 hours
+    timeout-minutes: 2880 # 48 hours

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -23,8 +23,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # every month
-    - cron: "0 0 1 * *"
+    # every friday noon
+    - cron: "0 12 * * 5"

 permissions:
  contents: read
@@ -35,15 +35,15 @@ jobs:
  setup-instance:
    name: gpu_code_validation_tests/setup-instance
    runs-on: ubuntu-latest
-    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved')
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,7 +79,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_core_h100_tests.yml
+++ b/.github/workflows/gpu_core_h100_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: gpu_fast_h100_tests
+name: gpu_core_h100_tests

 env:
  CARGO_TERM_COLOR: always
@@ -32,7 +32,7 @@ permissions:

 jobs:
  should-run:
-    name: gpu_fast_h100_tests/should-run
+    name: gpu_core_h100_tests/should-run
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -61,15 +61,14 @@ jobs:
              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
              - tfhe/src/shortint/parameters/**
-              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**/**.md'
-              - '.github/workflows/gpu_fast_h100_tests.yml'
+              - '.github/workflows/gpu_core_h100_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml

  setup-instance:
-    name: gpu_fast_h100_tests/setup-instance
+    name: gpu_core_h100_tests/setup-instance
    needs: should-run
    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -87,7 +86,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,7 +110,7 @@ jobs:
          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
-    name: gpu_fast_h100_tests/cuda-tests-linux
+    name: gpu_core_h100_tests/cuda-tests-linux
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -129,7 +128,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -155,20 +154,8 @@ jobs:
          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
          BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

-      - name: Run user docs tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
-
-      - name: Test C API
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
-
-      - name: Run High Level API Tests
-        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
-
  slack-notify:
-    name: gpu_fast_h100_tests/slack-notify
+    name: gpu_core_h100_tests/slack-notify
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -187,10 +174,10 @@ jobs:
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+          SLACK_MESSAGE: "Core H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"

  teardown-instance:
-    name: gpu_fast_h100_tests/teardown-instance
+    name: gpu_core_h100_tests/teardown-instance
    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
@@ -198,7 +185,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -39,7 +39,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -79,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -114,7 +114,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -151,7 +151,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu
+          make test_high_level_api_gpu_fast

  slack-notify:
    name: gpu_fast_tests/slack-notify
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -68,7 +68,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -154,7 +154,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          make test_high_level_api_gpu
+          make test_high_level_api_gpu_fast

  slack-notify:
    name: gpu_full_multi_gpu_tests/slack-notify
@@ -187,7 +187,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_hlapi_h100_tests.yml
+++ b/.github/workflows/gpu_hlapi_h100_tests.yml
@@ -0,0 +1,209 @@
+# Run the GPU user documentation, C API and high-level API tests on an H100 VM on Hyperstack
+name: gpu_hlapi_h100_tests
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"  # 8388608 = 8 * 1024 * 1024
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  SLACKIFY_MARKDOWN: true
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}  # read by should-run to compute its gpu_test output
+  PULL_REQUEST_MD_LINK: ""  # overwritten in slack-notify when the run comes from a pull request
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+  # Secrets will be available only to zama-ai organization members
+  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
+  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"  # runner selected by setup-instance when secrets are unavailable
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]  # pull_request events start this workflow only when a label is added
+
+permissions:
+  contents: read
+
+# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
+
+jobs:
+  should-run:
+    name: gpu_hlapi_h100_tests/should-run
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read  # Needed to check for file change
+    outputs:
+      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}  # true outside pull requests, otherwise the changed-files result
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files  # referenced by the gpu_test output above
+        uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
+        with:
+          files_yaml: |
+            gpu:
+              - tfhe/Cargo.toml
+              - tfhe/build.rs
+              - backends/tfhe-cuda-backend/**
+              - tfhe/src/core_crypto/gpu/**
+              - tfhe/src/integer/gpu/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_signed/**
+              - tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
+              - tfhe/src/shortint/parameters/**
+              - tfhe/src/high_level_api/**
+              - tfhe/src/c_api/**
+              - 'tfhe/docs/**/**.md'
+              - '.github/workflows/gpu_hlapi_h100_tests.yml'
+              - scripts/integer-tests.sh
+              - ci/slab.toml
+
+  setup-instance:
+    name: gpu_hlapi_h100_tests/setup-instance
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
+      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
+      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
+    runs-on: ubuntu-latest
+    outputs:
+      # Check the permanent remote instance label first: the on-demand label output is set before the start-remote-instance step finishes.
+      # If that step then fails because the GitHub Actions runner setup failed, we have to fall back on the permanent instance.
+      # Since the on-demand label is already set at that point, the logical OR must be evaluated in this order,
+      # otherwise the next job would be scheduled on a non-existent on-demand instance.
+      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}  # read by cuda-tests-linux and teardown-instance
+    steps:
+      - name: Start remote instance
+        id: start-remote-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        continue-on-error: true  # a failed start must not fail the job, so the permanent-instance fallback below can run
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+      # Fall back on a permanent instance running on Hyperstack when the on-demand start fails.
+      - name: Use permanent remote instance
+        id: use-permanent-instance
+        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        run: |
+          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
+
+      # Used for pull requests from forked repositories, where secrets are unavailable.
+      - name: Start GitHub instance
+        id: start-github-instance
+        if: env.SECRETS_AVAILABLE == 'false'
+        run: |
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
+
+  cuda-tests-linux:
+    name: gpu_hlapi_h100_tests/cuda-tests-linux
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    concurrency:
+      group: ${{ github.workflow_ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}  # runs on main are never cancelled
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}  # label chosen by setup-instance
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.8"
+            gcc: 11
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Setup Hyperstack dependencies
+        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'  # only when the on-demand instance was started
+        uses: ./.github/actions/gpu_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+      - name: Enable nvidia multi-process service
+        run: |
+          nvidia-cuda-mps-control -d
+
+      - name: Run user docs tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
+
+  slack-notify:
+    name: gpu_hlapi_h100_tests/slack-notify
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}  # notify only on failure, and only if the tests were not skipped
+    continue-on-error: true  # a failed notification must not fail the workflow
+    steps:
+      - name: Set pull-request URL
+        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
+        run: |
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+
+      - name: Send message
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "HL API H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
+
+  teardown-instance:
+    name: gpu_hlapi_h100_tests/teardown-instance
+    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}  # only when an on-demand instance was started
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop remote instance
+        id: stop-instance
+        if: env.SECRETS_AVAILABLE == 'true'
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # sent only when a previous step of this job failed
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (gpu_hlapi_h100_tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -38,7 +38,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -65,7 +65,7 @@ jobs:
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -74,7 +74,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -159,7 +159,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -40,7 +40,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -2,6 +2,7 @@
 name: hpu_hlapi_tests

 on:
+  workflow_dispatch:
  pull_request:
  push:
    branches:
@@ -25,6 +26,8 @@ permissions: {}
 jobs:
  should-run:
    name: hpu_hlapi_tests/should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    runs-on: ubuntu-latest
    permissions:
      pull-requests: read  # Needed to check for file change
@@ -32,7 +35,7 @@ jobs:
      hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -62,7 +65,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -83,7 +86,7 @@ jobs:
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}
@@ -114,7 +117,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -34,7 +34,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -53,7 +53,7 @@ jobs:
    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -83,7 +83,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -41,7 +41,7 @@ jobs:
    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: "false"
          token: ${{ env.CHECKOUT_TOKEN }}
--- a/.github/workflows/make_release_common.yml
+++ b/.github/workflows/make_release_common.yml
@@ -52,7 +52,7 @@ jobs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -75,6 +75,7 @@ jobs:
    name: make_release_common/provenance
    if: ${{ !inputs.dry-run  }}
    needs: package
+    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
    permissions:
      actions: read # Needed to detect the GitHub Actions environment
@@ -93,7 +94,7 @@ jobs:
      id-token: write # Needed for OIDC token exchange on crates.io
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: 'false'
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: "false"
@@ -117,6 +117,7 @@ jobs:
    name: make_release_cuda/provenance
    if: ${{ !inputs.dry_run  }}
    needs: [package]
+    # This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
    permissions:
      actions: read # Needed to detect the GitHub Actions environment
@@ -221,7 +222,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -68,7 +68,7 @@ jobs:
      id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
    steps:
      - name: Checkout
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -92,7 +92,7 @@ jobs:

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
-        uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
+        uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
@@ -109,7 +109,7 @@ jobs:

      - name: Publish Node package
        if: ${{ inputs.push_node_package }}
-        uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
+        uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
        with:
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -30,7 +30,7 @@ jobs:
    name: parameters_check/setup-instance
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      github.event_name == 'workflow_dispatch'
+      github.event_name != 'push'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
          toolchain: stable

      - name: Checkout lattice-estimator
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/placeholder_workflow.yml
+++ b/.github/workflows/placeholder_workflow.yml
@@ -1,18 +1,127 @@
 # Placeholder workflow file allowing running it without having to merge to main first
 name: placeholder_workflow

+run-name: Summary benchs tests
+
 on:
  workflow_dispatch:
+    inputs:
+      run-cpu-benchmarks:
+        description: "Run CPU benchmarks"
+        type: boolean
+        default: true
+      run-gpu-benchmarks:
+        description: "Run GPU benchmarks"
+        type: boolean
+        default: true
+      gpu-profile:
+        description: "GPU Instance type"
+        required: true
+        default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "4-l40 (n3-L40x4)"
+          - "8-l40 (n3-L40x8)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
+          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-h100-sxm5 (n3-H100-SXM5x8)"
+      run-hpu-benchmarks:
+        description: "Run HPU benchmarks"
+        type: boolean
+        default: true
+

 permissions: {}

 # zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow

 jobs:
-  placeholder:
-    name: placeholder_workflow/placeholder
+  parse-gpu-inputs:
+    name: placeholder_workflow/parse-gpu-inputs
+    if: inputs.run-gpu-benchmarks
    runs-on: ubuntu-latest
-
+    outputs:
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
+    env:
+      INPUTS_PROFILE: ${{ inputs.gpu-profile }}
    steps:
-      - run: |
-          echo "Hello this is a Placeholder Workflow"
+      - name: Parse profile
+        id: parse_profile
+        run: |
+          # Keep the text located before the parenthesized part of the input, e.g. "l40" for "l40 (n3-L40x1)".
+          # Sed is used because a capture group cannot be expressed with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
+          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"
+
+      - name: Parse hardware name
+        id: parse_hardware_name
+        run: |
+          # Keep the text located between the parentheses of the input, e.g. "n3-L40x1" for "l40 (n3-L40x1)".
+          # Sed is used because a capture group cannot be expressed with the ${variable//search/replace} pattern.
+          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
+          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
+
+  run-benchmarks-cpu:
+    name: placeholder_workflow/run-benchmarks-cpu
+    uses: ./.github/workflows/benchmark_cpu_common.yml
+    if: inputs.run-cpu-benchmarks
+    with:
+      command: summary
+      bench_type: both
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-gpu:
+    name: placeholder_workflow/run-benchmarks-gpu
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    if: inputs.run-gpu-benchmarks
+    needs: parse-gpu-inputs
+    with:
+      profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
+      command: summary
+      bench_type: both
+      params_type: classical + multi_bit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+# TODO add make recipe for HPU benchmarks
+#  run-benchmarks-hpu:
+#    name: benchmark_documentation/run-benchmarks-hpu
+#    uses: ./.github/workflows/benchmark_hpu_common.yml
+#    if: inputs.run-hpu-benchmarks
+#    with:
+#      command: summary
+#      bench_type: both
+#      v80_pcie_dev: 24
+#      v80_serial_number: XFL12NWY3ZKG
+#    secrets:
+#      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+#      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+#      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+#      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+#      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+#      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+#      SLAB_URL: ${{ secrets.SLAB_URL }}
+#      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+#      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
--- a/.github/workflows/pr_milestone_check.yml
+++ b/.github/workflows/pr_milestone_check.yml
@@ -1,67 +0,0 @@
-name: pr_milestone_check
-
-on:
-  pull_request:
-    types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
-# external contributors workflows are manually approved
-
-jobs:
-  check-empty-milestone:
-    name: pr_milestone_check/check-empty-milestone
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone == null
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone Missing
-
-            Please assign a milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is missing. This check is failing."
-          exit 1
-
-  check-milestone-open:
-    name: pr_milestone_check/check-milestone-open
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone is closed
-
-            Please assign an open milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is closed. This check is failing."
-          exit 1
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -30,7 +30,7 @@ jobs:
        run: |
          echo ">>> Cloning source repo..."
          git lfs install
-          git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
+          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"

          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -47,6 +47,8 @@ jobs:

          echo ">>> Pushing all LFS items..."
          git lfs push --all destination "${DESTINATION_BRANCH}"
+          
+          shred --remove .git/config

      - name: git-sync-tags
        env:
@@ -59,7 +61,7 @@ jobs:
        run: |
          echo ">>> Cloning source repo..."
          git lfs install
-          git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
+          git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
          git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"

          echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -70,3 +72,5 @@ jobs:

          echo ">>> Pushing git changes..."
          git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
+          
+          shred --remove .git/config
--- a/.github/workflows/unverified_prs.yml
+++ b/.github/workflows/unverified_prs.yml
@@ -12,6 +12,7 @@ permissions: {}
 jobs:
  stale:
    name: unverified_prs/stale
+    if: github.repository == 'zama-ai/tfhe-rs'
    runs-on: ubuntu-latest
    permissions:
      issues: read # Needed to fetch all issues
--- a/161
+++ b/161
@@ -733,11 +733,12 @@ test_core_crypto_gpu:
 		--features=gpu -p tfhe -- core_crypto::gpu::

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
-test_integer_gpu:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=2
-	RUSTFLAGS="$(RUSTFLAGS)" cargo test --doc --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=4
+test_integer_gpu: install_cargo_nextest
+	TEST_THREADS=2 \
+	DOCTEST_THREADS=4 \
+		./scripts/integer-tests.sh \
+		--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
+		--tfhe-package "tfhe" --all-but-noise

 .PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
 test_integer_gpu_debug:
@@ -1049,10 +1050,16 @@ test_high_level_api:
 		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
 		-- high_level_api::

-test_high_level_api_gpu: install_cargo_nextest
+test_high_level_api_gpu_fast: install_cargo_nextest # Run all the GPU tests for high_level_api except test_uniformity for oprf which is too long
 	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
 		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-		-E "test(/high_level_api::.*gpu.*/)"
+		-E "test(/high_level_api::.*gpu.*/) and not test(/uniformity/)"
+
+
+test_high_level_api_gpu: install_cargo_nextest # Run all the GPU tests for high_level_api
+	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
+		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
+		-E "test(/high_level_api::.*gpu.*/)"

 test_list_gpu: install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo nextest list --cargo-profile $(CARGO_PROFILE) \
@@ -1371,6 +1378,9 @@ clippy_bench: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
 		-p tfhe-benchmark -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+	  --features=shortint,internal-keycache \
+		-p tfhe-benchmark -- --no-deps -D warnings

 .PHONY: clippy_bench_gpu # Run clippy lints on tfhe-benchmark
 clippy_bench_gpu: install_rs_check_toolchain
@@ -1405,14 +1415,14 @@ bench_signed_integer: install_rs_check_toolchain

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_signed_integer_gpu # Run benchmarks for signed integer on GPU backend
 bench_signed_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1428,14 +1438,14 @@ bench_integer_hpu: install_rs_check_toolchain

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-glwe_packing_compression \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-glwe_packing_compression \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1449,7 +1459,8 @@ bench_integer_compression_128b_gpu: install_rs_check_toolchain

 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" \
+	__TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
 	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
@@ -1475,6 +1486,13 @@ bench_integer_trivium_gpu: install_rs_check_toolchain
 	--bench integer-trivium \
 	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

+.PHONY: bench_integer_kreyvium_gpu # Run benchmarks for kreyvium on GPU backend
+bench_integer_kreyvium_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-kreyvium \
+	--features=integer,internal-keycache,gpu -p tfhe-benchmark --profile release_lto_off --
+
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1509,7 +1527,7 @@ bench_signed_integer_multi_bit_gpu: install_rs_check_toolchain

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-zk-pke \
 	--features=integer,internal-keycache,zk-pok,pbs-stats \
@@ -1655,11 +1673,18 @@ bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) bench_web_js_api_unsafe_coop_firefox

-.PHONY: bench_hlapi # Run benchmarks for integer operations
-bench_hlapi: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
+.PHONY: bench_hlapi_unsigned # Run high-level API benchmarks for unsigned integer operations
+bench_hlapi_unsigned: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi \
+	--bench hlapi_unsigned \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
+
+.PHONY: bench_hlapi_signed # Run high-level API benchmarks for signed integer operations
+bench_hlapi_signed: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi_signed \
 	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_hlapi_gpu # Run benchmarks for integer operations on GPU
@@ -1749,6 +1774,108 @@ bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
 	--bench hlapi-noise-squash \
 	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --

+.PHONY: bench_hlapi_kvstore # Run high-level API benchmarks for Key-Value Store operations
+bench_hlapi_kvstore: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-kvstore \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
+
+.PHONY: bench_summary # Run summary benchmarks on CPU
+bench_summary: install_rs_check_toolchain
+	# Arithmetic operations: addition, multiplication, division, comparison
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi_unsigned \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::add|::mul|::gt|::div_rem'
+
+	# Noise squash
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::noise_squash::'
+
+	# ERC20
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-erc20 \
+	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'
+
+	# DEX
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::no_cmux::'
+
+	# ZK
+	# Proof: only the benchmarks matching '::pke_zk_proof' are run
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,zk-pok,pbs-stats \
+	-p tfhe-benchmark -- '::pke_zk_proof'
+	# Verify: CPU build here (no gpu feature); no name filter is passed after '--'
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,pbs-stats,zk-pok -p tfhe-benchmark --
+
+	# Compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-glwe_packing_compression \
+	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
+
+.PHONY: bench_summary_gpu # Run summary benchmarks on GPU
+bench_summary_gpu: install_rs_check_toolchain
+	# Arithmetic operations: addition, multiplication, division, comparison
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=FAST_DEFAULT __TFHE_RS_BENCH_BIT_SIZES_SET=FAST __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::add|::mul|::gt|::div_rem'
+
+	# Noise squash
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::noise_squash::'
+
+	# Noise squash and compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
+
+	# ERC20
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-erc20 \
+	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
+
+	# DEX
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::no_cmux::'
+
+	# ZK
+	# Proof is done on CPU node of the instance (built without the gpu feature)
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,zk-pok,pbs-stats \
+	-p tfhe-benchmark -- '::pke_zk_proof'
+	# Verify is done on GPUs, with the same build profile as the other GPU benchmarks
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-zk-pke \
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
+
+	# Compression
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-glwe_packing_compression \
+	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --

 .PHONY: bench_custom # Run benchmarks with a user-defined command
 bench_custom: install_rs_check_toolchain
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -87,6 +87,7 @@ fn main() {
            "cuda/include/integer/rerand.h",
            "cuda/include/aes/aes.h",
            "cuda/include/trivium/trivium.h",
+            "cuda/include/kreyvium/kreyvium.h",
            "cuda/include/zk/zk.h",
            "cuda/include/keyswitch/keyswitch.h",
            "cuda/include/keyswitch/ks_enums.h",
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -29,15 +29,13 @@ template <typename Torus> struct int_aes_lut_buffers {
        allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus, Torus)> and_lambda =
        [](Torus a, Torus b) -> Torus { return a & b; };
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
-        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, and_lambda, allocate_gpu_memory);
+
    auto active_streams_and_lut = streams.active_gpu_subset(
        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
        params.pbs_type);
-    this->and_lut->broadcast_lut(active_streams_and_lut);
+    this->and_lut->generate_and_broadcast_bivariate_lut(
+        active_streams_and_lut, {0}, {and_lambda}, LUT_0_FOR_ALL_BLOCKS);
+
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->flush_lut = new int_radix_lut<Torus>(
@@ -46,14 +44,11 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
      return x & 1;
    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
-        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, flush_lambda, allocate_gpu_memory);
+
    auto active_streams_flush_lut = streams.active_gpu_subset(
        AES_STATE_BITS * num_aes_inputs, params.pbs_type);
-    this->flush_lut->broadcast_lut(active_streams_flush_lut);
+    this->flush_lut->generate_and_broadcast_lut(
+        active_streams_flush_lut, {0}, {flush_lambda}, LUT_0_FOR_ALL_BLOCKS);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->carry_lut = new int_radix_lut<Torus>(
@@ -61,14 +56,11 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
      return (x >> 1) & 1;
    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
-        this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, carry_lambda, allocate_gpu_memory);
+
    auto active_streams_carry_lut =
        streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
-    this->carry_lut->broadcast_lut(active_streams_carry_lut);
+    this->carry_lut->generate_and_broadcast_lut(
+        active_streams_carry_lut, {0}, {carry_lambda}, LUT_0_FOR_ALL_BLOCKS);
    this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -10,11 +10,7 @@ extern std::mutex m;
 extern bool p2p_enabled;
 extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
 extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
-
-extern "C" {
-int32_t cuda_setup_multi_gpu(int device_0_id);
-}
-
+extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
 // Define a variant type that can be either a vector or a single pointer
 template <typename Torus>
 using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
@@ -42,6 +38,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,

 uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                              PBS_TYPE pbs_type);
+uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
+                                   PBS_TYPE pbs_type);

 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

@@ -80,7 +78,15 @@ public:
        _streams, _gpu_indexes,
        get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
  }
-
+  // Returns a subset of this set as an active subset for pbs128. An active
+  // subset is one that is temporarily used to perform some computation. For
+  // pbs128, the threshold is different, because the original threshold was
+  // designed for 2_2 params.
+  CudaStreams active_gpu_subset_u128(int num_radix_blocks, PBS_TYPE pbs_type) {
+    return CudaStreams(
+        _streams, _gpu_indexes,
+        get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type));
+  }
  // Returns a CudaStreams struct containing only the ith stream
  CudaStreams get_ith(int i) const {
    return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
@@ -144,9 +150,9 @@ public:
        _gpu_count(src._gpu_count), _owns_streams(false) {}

  CudaStreams &operator=(CudaStreams const &other) {
-    PANIC_IF_FALSE(this->_streams == nullptr ||
-                       this->_streams == other._streams,
-                   "Assigning an already initialized CudaStreams");
+    /*    PANIC_IF_FALSE(this->_streams == nullptr ||
+                           this->_streams == other._streams,
+                       "Assigning an already initialized CudaStreams");*/
    this->_streams = other._streams;
    this->_gpu_indexes = other._gpu_indexes;
    this->_gpu_count = other._gpu_count;
--- a/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
@@ -45,12 +45,9 @@ template <typename Torus> struct boolean_bitop_buffer {

        // BooleanBlock can have degree 0 or 1. when ct is 0 path is hardcoded,
        // only lut for degree = 1 is generated
-        generate_device_accumulator_bivariate_with_factor<Torus>(
-            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_bivariate_f, 2, gpu_memory_allocated);
-        lut->broadcast_lut(active_streams);
+        lut->generate_and_broadcast_bivariate_lut(active_streams, {0},
+                                                  {lut_bivariate_f},
+                                                  LUT_0_FOR_ALL_BLOCKS, {}, 2);
      }
      break;
    default:
@@ -65,14 +62,8 @@ template <typename Torus> struct boolean_bitop_buffer {
        return x % params.message_modulus;
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          message_extract_lut->get_lut(0, 0),
-          message_extract_lut->get_degree(0),
-          message_extract_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_message_extract, gpu_memory_allocated);
-      message_extract_lut->broadcast_lut(active_streams);
+      message_extract_lut->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
    }
    tmp_lwe_left = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -142,12 +133,8 @@ template <typename Torus> struct int_bitop_buffer {
          }
        };

-        generate_device_accumulator_bivariate<Torus>(
-            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
-        lut->broadcast_lut(active_streams);
+        lut->generate_and_broadcast_bivariate_lut(
+            active_streams, {0}, {lut_bivariate_f}, LUT_0_FOR_ALL_BLOCKS);
      }
      break;
    default:
@@ -156,6 +143,8 @@ template <typename Torus> struct int_bitop_buffer {
                                     num_radix_blocks, allocate_gpu_memory,
                                     size_tracker);

+      std::vector<std::function<Torus(Torus)>> lut_funcs;
+      std::vector<uint32_t> lut_indices;
      for (int i = 0; i < params.message_modulus; i++) {
        auto rhs = i;

@@ -171,14 +160,13 @@ template <typename Torus> struct int_bitop_buffer {
            return x ^ rhs;
          }
        };
-        generate_device_accumulator<Torus>(
-            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
-            lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_univariate_scalar_f,
-            gpu_memory_allocated);
-        lut->broadcast_lut(active_streams);
+
+        lut_funcs.push_back(lut_univariate_scalar_f);
+        lut_indices.push_back(i);
      }
+
+      lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
+                                      LUT_0_FOR_ALL_BLOCKS);
    }
  }

@@ -211,16 +199,11 @@ template <typename Torus> struct boolean_bitnot_buffer {
        return x % message_modulus;
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          message_extract_lut->get_lut(0, 0),
-          message_extract_lut->get_degree(0),
-          message_extract_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_message_extract, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
-      message_extract_lut->broadcast_lut(active_streams);
+
+      message_extract_lut->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
@@ -28,20 +28,16 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
      uint32_t bits_per_block = std::log2(params.message_modulus);
      uint32_t msg_modulus = params.message_modulus;

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          [msg_modulus, bits_per_block](Torus x) {
+      auto active_streams =
+          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+
+      lut->generate_and_broadcast_lut(
+          active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
            const auto xm = x % msg_modulus;
            const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
            return (Torus)((msg_modulus - 1) * sign_bit);
-          },
-          allocate_gpu_memory);
-
-      auto active_streams =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      lut->broadcast_lut(active_streams);
+          }},
+          LUT_0_FOR_ALL_BLOCKS);

      this->last_block = new CudaRadixCiphertextFFI;

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
@@ -85,42 +85,28 @@ template <typename Torus> struct int_cmux_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
-        predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
-        predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
-        message_extract_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        message_extract_lut_f, gpu_memory_allocated);
-    Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
-    for (int index = 0; index < 2 * num_radix_blocks; index++) {
-      if (index < num_radix_blocks) {
-        h_lut_indexes[index] = 0;
-      } else {
-        h_lut_indexes[index] = 1;
-      }
-    }
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
-        2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
-        streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams_pred =
        streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
-    predicate_lut->broadcast_lut(active_streams_pred);
+    auto lut_index_generator = [num_radix_blocks](Torus *h_lut_indexes,
+                                                  uint32_t num_indexes) {
+      for (int index = 0; index < 2 * num_radix_blocks; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+    };
+
+    predicate_lut->generate_and_broadcast_bivariate_lut(
+        active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
+        lut_index_generator);
+
    auto active_streams_msg =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    message_extract_lut->broadcast_lut(active_streams_msg);
+
+    message_extract_lut->generate_and_broadcast_lut(
+        active_streams_msg, {0}, {message_extract_lut_f}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -28,7 +28,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
    Torus total_modulus = params.message_modulus * params.carry_modulus;
    uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);

-    int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
+    int max_chunks = CEIL_DIV(num_radix_blocks, max_value);
    tmp_out = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), tmp_out, num_radix_blocks,
@@ -39,22 +39,21 @@ template <typename Torus> struct int_are_all_block_true_buffer {
        max_chunks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

-    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
-                                            allocate_gpu_memory, size_tracker);
-    auto is_max_value_f = [max_value](Torus x) -> Torus {
-      return x == max_value;
-    };
    preallocated_h_lut = (Torus *)malloc(
        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
-        is_max_value->get_degree(0), is_max_value->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, is_max_value_f, gpu_memory_allocated);
+
+    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
+                                            allocate_gpu_memory, size_tracker);

    auto active_streams =
        streams.active_gpu_subset(max_chunks, params.pbs_type);
-    is_max_value->broadcast_lut(active_streams);
+
+    auto is_max_value_f = [max_value](Torus x) -> Torus {
+      return x == max_value;
+    };
+
+    is_max_value->generate_and_broadcast_lut(
+        active_streams, {0}, {is_max_value_f}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
@@ -103,15 +102,10 @@ template <typename Torus> struct int_comparison_eq_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
-        is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
-
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    is_non_zero_lut->broadcast_lut(active_streams);
+    is_non_zero_lut->generate_and_broadcast_lut(
+        active_streams, {0}, {is_non_zero_lut_f}, LUT_0_FOR_ALL_BLOCKS);

    // Scalar may have up to num_radix_blocks blocks
    scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -129,32 +123,27 @@ template <typename Torus> struct int_comparison_eq_buffer {
        return (lhs == rhs);
      }
    };
+
+    std::vector<std::function<Torus(Torus)>> lut_funcs;
+    std::vector<uint32_t> lut_indices;
    for (int i = 0; i < total_modulus; i++) {
      auto lut_f = [i, operator_f](Torus x) -> Torus {
        return operator_f(i, x);
      };
-
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          scalar_comparison_luts->get_lut(0, i),
-          scalar_comparison_luts->get_degree(i),
-          scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f, gpu_memory_allocated);
+      lut_funcs.push_back(lut_f);
+      lut_indices.push_back(i);
    }
-    scalar_comparison_luts->broadcast_lut(active_streams);
+
+    scalar_comparison_luts->generate_and_broadcast_lut(
+        active_streams, lut_indices, lut_funcs, LUT_0_FOR_ALL_BLOCKS);
+
    if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
      operator_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
-          operator_lut->get_degree(0), operator_lut->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, operator_f, gpu_memory_allocated);
-
-      operator_lut->broadcast_lut(active_streams);
+      operator_lut->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {operator_f}, LUT_0_FOR_ALL_BLOCKS);
    } else {
      operator_lut = nullptr;
    }
@@ -221,9 +210,6 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
        streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    // LUTs
-    tree_inner_leaf_lut =
-        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
-                                 allocate_gpu_memory, size_tracker);

    tree_last_leaf_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -234,15 +220,14 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
    tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
-        tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        block_selector_f, gpu_memory_allocated);
+    tree_inner_leaf_lut =
+        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
+                                 allocate_gpu_memory, size_tracker);
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    tree_inner_leaf_lut->broadcast_lut(active_streams);
+    tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {block_selector_f}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
@@ -426,12 +411,8 @@ template <typename Torus> struct int_comparison_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
-        identity_lut->get_degree(0), identity_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, identity_lut_f, gpu_memory_allocated);
-    identity_lut->broadcast_lut(active_streams);
+    identity_lut->generate_and_broadcast_lut(
+        active_streams, {0}, {identity_lut_f}, LUT_0_FOR_ALL_BLOCKS);

    uint32_t total_modulus = params.message_modulus * params.carry_modulus;
    auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -441,13 +422,8 @@ template <typename Torus> struct int_comparison_buffer {
    is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                           allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
-        is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, is_zero_f, gpu_memory_allocated);
-
-    is_zero_lut->broadcast_lut(active_streams);
+    is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
+                                            LUT_0_FOR_ALL_BLOCKS);

    switch (op) {
    case COMPARISON_TYPE::MAX:
@@ -522,13 +498,9 @@ template <typename Torus> struct int_comparison_buffer {
        PANIC("Cuda error: sign_lut creation failed due to wrong function.")
      };

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
-          signed_lut->get_degree(0), signed_lut->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, signed_lut_f, gpu_memory_allocated);
      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      signed_lut->broadcast_lut(active_streams);
+      signed_lut->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {signed_lut_f}, LUT_0_FOR_ALL_BLOCKS);
    }
    preallocated_h_lut = (Torus *)malloc(
        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -11,16 +11,26 @@ template <typename Torus> struct int_compression {
  Torus *tmp_glwe_array_out;
  bool gpu_memory_allocated;
  uint32_t lwe_per_glwe;
+  uint32_t max_num_glwes;

+  // num_radix_blocks: total number of LWE ciphertexts (radix blocks) to
+  // compress. lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size),
+  // defined by the chosen parameter set.
  int_compression(CudaStreams streams, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
+    this->lwe_per_glwe = lwe_per_glwe;

    uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                     compression_params.polynomial_size;

+    // Calculate the actual number of GLWEs needed based on total radix blocks.
+    // This ensures we allocate enough memory when num_radix_blocks >
+    // lwe_per_glwe.
+    max_num_glwes = CEIL_DIV(num_radix_blocks, lwe_per_glwe);
+
    tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
        num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
            sizeof(Torus),
@@ -28,7 +38,7 @@ template <typename Torus> struct int_compression {
        allocate_gpu_memory));
    tmp_glwe_array_out =
        static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
-            lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
+            max_num_glwes * glwe_accumulator_size * sizeof(Torus),
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory));

@@ -106,19 +116,13 @@ template <typename Torus> struct int_decompression {
          encryption_params.carry_modulus;
      auto effective_compression_carry_modulus = 1;

-      generate_device_accumulator_with_encoding<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          decompression_rescale_lut->get_lut(0, 0),
-          decompression_rescale_lut->get_degree(0),
-          decompression_rescale_lut->get_max_degree(0),
-          encryption_params.glwe_dimension, encryption_params.polynomial_size,
-          effective_compression_message_modulus,
-          effective_compression_carry_modulus,
-          encryption_params.message_modulus, encryption_params.carry_modulus,
-          decompression_rescale_f, gpu_memory_allocated);
      auto active_streams = streams.active_gpu_subset(
          num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
-      decompression_rescale_lut->broadcast_lut(active_streams);
+      decompression_rescale_lut->generate_and_broadcast_lut_with_encoding(
+          active_streams, {0}, {decompression_rescale_f},
+          effective_compression_message_modulus,
+          effective_compression_carry_modulus,
+          encryption_params.message_modulus, encryption_params.carry_modulus);
    }
  }
  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -283,12 +283,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                                     zero_out_if_not_1_lut_2};
    size_t lut_gpu_indexes[2] = {0, 3};
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(lut_gpu_indexes[j]),
-          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
+      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
+                                          {0}, {zero_out_if_not_1_lut_f},
+                                          LUT_0_FOR_ALL_BLOCKS);
    }

    luts[0] = zero_out_if_not_2_lut_1;
@@ -296,12 +293,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    lut_gpu_indexes[0] = 1;
    lut_gpu_indexes[1] = 2;
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(lut_gpu_indexes[j]),
-          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
+      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
+                                          {0}, {zero_out_if_not_2_lut_f},
+                                          LUT_0_FOR_ALL_BLOCKS);
    }

    quotient_lut_1 =
@@ -321,21 +315,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    };
    auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };

-    generate_device_accumulator<Torus>(
-        streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
-        quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
-    generate_device_accumulator<Torus>(
-        streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
-        quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
-        quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
+    quotient_lut_1->generate_and_broadcast_lut(
+        streams.get_ith(2), {0}, {quotient_lut_1_f}, LUT_0_FOR_ALL_BLOCKS);
+    quotient_lut_2->generate_and_broadcast_lut(
+        streams.get_ith(1), {0}, {quotient_lut_2_f}, LUT_0_FOR_ALL_BLOCKS);
+    quotient_lut_3->generate_and_broadcast_lut(
+        streams.get_ith(0), {0}, {quotient_lut_3_f}, LUT_0_FOR_ALL_BLOCKS);

    message_extract_lut_1 = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -350,15 +335,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    luts[0] = message_extract_lut_1;
    luts[1] = message_extract_lut_2;

+    auto active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
+
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
-      auto active_streams =
-          streams.active_gpu_subset(num_blocks, params.pbs_type);
-      luts[j]->broadcast_lut(active_streams);
+      luts[j]->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
    }
  }

@@ -503,29 +485,35 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
          (Torus *)cuda_malloc_with_size_tracking_async(
              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
              size_tracker, allocate_gpu_memory);
-      for (int index = 0; index < nb; index++) {
-        uint32_t grouping_index = index / group_size;
-        bool is_in_first_grouping = (grouping_index == 0);
-        uint32_t index_in_grouping = index % group_size;
-        bool is_last_index = (index == (nb - 1));
-        if (is_last_index) {
-          if (nb == 1) {
-            h_lut_indexes[index] = 2 * group_size;
+
+      auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
+        for (int index = 0; index < nb; index++) {
+          uint32_t grouping_index = index / group_size;
+          bool is_in_first_grouping = (grouping_index == 0);
+          uint32_t index_in_grouping = index % group_size;
+          bool is_last_index = (index == (nb - 1));
+          if (is_last_index) {
+            if (nb == 1) {
+              h_lut_indexes[index] = 2 * group_size;
+            } else {
+              h_lut_indexes[index] = 2;
+            }
+          } else if (is_in_first_grouping) {
+            h_lut_indexes[index] = index_in_grouping;
          } else {
-            h_lut_indexes[index] = 2;
+            h_lut_indexes[index] = index_in_grouping + group_size;
          }
-        } else if (is_in_first_grouping) {
-          h_lut_indexes[index] = index_in_grouping;
-        } else {
-          h_lut_indexes[index] = index_in_grouping + group_size;
        }
-      }
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          first_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
-          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-          allocate_gpu_memory);
+      };
+
+      generate_lut_indexes<Torus>(streams, index_generator,
+                                  first_indexes_for_overflow_sub_gpu_0[nb - 1],
+                                  nb, 2 * group_size + 1, h_lut_indexes,
+                                  allocate_gpu_memory);
    }
    // Extra indexes for the luts in second step
+    uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
+    uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
    for (int nb = 1; nb <= num_blocks; nb++) {
      second_indexes_for_overflow_sub_gpu_0[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
@@ -536,24 +524,37 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
              size_tracker, allocate_gpu_memory);

+      auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
+                                                       uint32_t) {
+        for (int index = 0; index < nb; index++) {
+          uint32_t grouping_index = index / group_size;
+          bool is_in_first_grouping = (grouping_index == 0);
+          uint32_t index_in_grouping = index % group_size;
+
+          if (is_in_first_grouping) {
+            h_lut_indexes[index] = index_in_grouping;
+          } else if (index_in_grouping == (group_size - 1)) {
+            if (use_seq) {
+              int inner_index = (grouping_index - 1) % (group_size - 1);
+              h_lut_indexes[index] = inner_index + 2 * group_size;
+            } else {
+              h_lut_indexes[index] = 2 * group_size;
+            }
+          } else {
+            h_lut_indexes[index] = index_in_grouping + group_size;
+          }
+        }
+      };
+
+      generate_lut_indexes<Torus>(streams, index_generator,
+                                  second_indexes_for_overflow_sub_gpu_0[nb - 1],
+                                  nb, num_luts_second_step, h_lut_indexes,
+                                  allocate_gpu_memory);
+
      for (int index = 0; index < nb; index++) {
        uint32_t grouping_index = index / group_size;
        bool is_in_first_grouping = (grouping_index == 0);
        uint32_t index_in_grouping = index % group_size;
-
-        if (is_in_first_grouping) {
-          h_lut_indexes[index] = index_in_grouping;
-        } else if (index_in_grouping == (group_size - 1)) {
-          if (use_seq) {
-            int inner_index = (grouping_index - 1) % (group_size - 1);
-            h_lut_indexes[index] = inner_index + 2 * group_size;
-          } else {
-            h_lut_indexes[index] = 2 * group_size;
-          }
-        } else {
-          h_lut_indexes[index] = index_in_grouping + group_size;
-        }
-
        bool may_have_its_padding_bit_set =
            !is_in_first_grouping && (index_in_grouping == group_size - 1);

@@ -567,10 +568,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
          h_scalar[index] = 0;
        }
      }
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          second_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
-          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-          allocate_gpu_memory);
      cuda_memcpy_with_size_tracking_async_to_gpu(
          scalars_for_overflow_sub_gpu_0[nb - 1], h_scalar, nb * sizeof(Torus),
          streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
@@ -1007,24 +1004,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      masking_luts_2[i] = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
-          masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_masking, gpu_memory_allocated);
      auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
-      masking_luts_1[i]->broadcast_lut(active_streams_1);
+      masking_luts_1[i]->generate_and_broadcast_lut(
+          active_streams_1, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
-          masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f_masking, gpu_memory_allocated);
      auto active_streams_2 =
          streams.active_gpu_subset(num_blocks, params.pbs_type);
-      masking_luts_2[i]->broadcast_lut(active_streams_2);
+      masking_luts_2[i]->generate_and_broadcast_lut(
+          active_streams_2, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);
    }

    // create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1042,15 +1029,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

    int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
                                     message_extract_lut_2};
+
    auto active_streams =
        streams.active_gpu_subset(num_blocks, params.pbs_type);
    for (int j = 0; j < 2; j++) {
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
-          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
-      luts[j]->broadcast_lut(active_streams);
+      luts[j]->generate_and_broadcast_lut(
+          active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
    }

    // Give name to closures to improve readability
@@ -1076,24 +1060,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      }
    };

-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_did_not_happen[0]->get_lut(0, 0),
-        zero_out_if_overflow_did_not_happen[0]->get_degree(0),
-        zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, cur_lut_f, params.message_modulus - 2,
-        gpu_memory_allocated);
-    zero_out_if_overflow_did_not_happen[0]->broadcast_lut(active_streams);
-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
-        zero_out_if_overflow_did_not_happen[1]->get_degree(0),
-        zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, cur_lut_f, params.message_modulus - 1,
-        gpu_memory_allocated);
-    zero_out_if_overflow_did_not_happen[1]->broadcast_lut(active_streams);
+    zero_out_if_overflow_did_not_happen[0]
+        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
+                                               LUT_0_FOR_ALL_BLOCKS, {},
+                                               params.message_modulus - 2);
+    zero_out_if_overflow_did_not_happen[1]
+        ->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
+                                               LUT_0_FOR_ALL_BLOCKS, {},
+                                               params.message_modulus - 1);

    // create and generate zero_out_if_overflow_happened
    zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -1110,24 +1084,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      }
    };

-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_happened[0]->get_lut(0, 0),
-        zero_out_if_overflow_happened[0]->get_degree(0),
-        zero_out_if_overflow_happened[0]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
-        gpu_memory_allocated);
-    zero_out_if_overflow_happened[0]->broadcast_lut(active_streams);
-    generate_device_accumulator_bivariate_with_factor<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        zero_out_if_overflow_happened[1]->get_lut(0, 0),
-        zero_out_if_overflow_happened[1]->get_degree(0),
-        zero_out_if_overflow_happened[1]->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
-        gpu_memory_allocated);
-    zero_out_if_overflow_happened[1]->broadcast_lut(active_streams);
+    zero_out_if_overflow_happened[0]->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
+        params.message_modulus - 2);
+    zero_out_if_overflow_happened[1]->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
+        params.message_modulus - 1);

    // merge_overflow_flags_luts
    merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
@@ -1141,14 +1103,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          merge_overflow_flags_luts[i]->get_lut(0, 0),
-          merge_overflow_flags_luts[i]->get_degree(0),
-          merge_overflow_flags_luts[i]->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f_bit, gpu_memory_allocated);
-      merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
+      merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
+          active_gpu_count_for_bits, {0}, {lut_f_bit}, LUT_0_FOR_ALL_BLOCKS);
    }
  }

@@ -1220,29 +1176,34 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
          (Torus *)cuda_malloc_with_size_tracking_async(
              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
              size_tracker, allocate_gpu_memory);
-      for (int index = 0; index < nb; index++) {
-        uint32_t grouping_index = index / group_size;
-        bool is_in_first_grouping = (grouping_index == 0);
-        uint32_t index_in_grouping = index % group_size;
-        bool is_last_index = (index == (nb - 1));
-        if (is_last_index) {
-          if (nb == 1) {
-            h_lut_indexes[index] = 2 * group_size;
+
+      auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
+        for (int index = 0; index < nb; index++) {
+          uint32_t grouping_index = index / group_size;
+          bool is_in_first_grouping = (grouping_index == 0);
+          uint32_t index_in_grouping = index % group_size;
+          bool is_last_index = (index == (nb - 1));
+          if (is_last_index) {
+            if (nb == 1) {
+              h_lut_indexes[index] = 2 * group_size;
+            } else {
+              h_lut_indexes[index] = 2;
+            }
+          } else if (is_in_first_grouping) {
+            h_lut_indexes[index] = index_in_grouping;
          } else {
-            h_lut_indexes[index] = 2;
+            h_lut_indexes[index] = index_in_grouping + group_size;
          }
-        } else if (is_in_first_grouping) {
-          h_lut_indexes[index] = index_in_grouping;
-        } else {
-          h_lut_indexes[index] = index_in_grouping + group_size;
        }
-      }
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
-          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-          allocate_gpu_memory);
+      };
+
+      generate_lut_indexes<Torus>(
+          streams, index_generator, first_indexes_for_overflow_sub[nb - 1], nb,
+          2 * group_size + 1, h_lut_indexes, allocate_gpu_memory);
    }
    // Extra indexes for the luts in second step
+    uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
+    uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
    for (int nb = 1; nb <= num_blocks; nb++) {
      second_indexes_for_overflow_sub[nb - 1] =
          (Torus *)cuda_malloc_with_size_tracking_async(
@@ -1253,24 +1214,36 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
              nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
              size_tracker, allocate_gpu_memory);

+      auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
+                                                       uint32_t) {
+        for (int index = 0; index < nb; index++) {
+          uint32_t grouping_index = index / group_size;
+          bool is_in_first_grouping = (grouping_index == 0);
+          uint32_t index_in_grouping = index % group_size;
+
+          if (is_in_first_grouping) {
+            h_lut_indexes[index] = index_in_grouping;
+          } else if (index_in_grouping == (group_size - 1)) {
+            if (use_seq) {
+              int inner_index = (grouping_index - 1) % (group_size - 1);
+              h_lut_indexes[index] = inner_index + 2 * group_size;
+            } else {
+              h_lut_indexes[index] = 2 * group_size;
+            }
+          } else {
+            h_lut_indexes[index] = index_in_grouping + group_size;
+          }
+        }
+      };
+
+      generate_lut_indexes<Torus>(
+          streams, index_generator, second_indexes_for_overflow_sub[nb - 1], nb,
+          num_luts_second_step, h_lut_indexes, allocate_gpu_memory);
+
      for (int index = 0; index < nb; index++) {
        uint32_t grouping_index = index / group_size;
        bool is_in_first_grouping = (grouping_index == 0);
        uint32_t index_in_grouping = index % group_size;
-
-        if (is_in_first_grouping) {
-          h_lut_indexes[index] = index_in_grouping;
-        } else if (index_in_grouping == (group_size - 1)) {
-          if (use_seq) {
-            int inner_index = (grouping_index - 1) % (group_size - 1);
-            h_lut_indexes[index] = inner_index + 2 * group_size;
-          } else {
-            h_lut_indexes[index] = 2 * group_size;
-          }
-        } else {
-          h_lut_indexes[index] = index_in_grouping + group_size;
-        }
-
        bool may_have_its_padding_bit_set =
            !is_in_first_grouping && (index_in_grouping == group_size - 1);

@@ -1284,10 +1257,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
          h_scalar[index] = 0;
        }
      }
-      cuda_memcpy_with_size_tracking_async_to_gpu(
-          second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
-          nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-          allocate_gpu_memory);
      cuda_memcpy_with_size_tracking_async_to_gpu(
          scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
          streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
@@ -1557,16 +1526,12 @@ template <typename Torus> struct int_div_rem_memory {
      compare_signed_bits_lut = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          compare_signed_bits_lut->get_lut(0, 0),
-          compare_signed_bits_lut->get_degree(0),
-          compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          f_compare_extracted_signed_bits, gpu_memory_allocated);
      auto active_gpu_count_cmp =
          streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
-      compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
+
+      compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
+          active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
+          LUT_0_FOR_ALL_BLOCKS);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
@@ -53,13 +53,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return count;
    };

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
-        univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
-
-    univ_lut_mem->broadcast_lut(active_streams);
+    univ_lut_mem->generate_and_broadcast_lut(
+        active_streams, {0}, {generate_uni_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);

    auto generate_bi_lut_lambda =
        [num_bits](Torus block_num_bit_count,
@@ -70,13 +65,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return 0;
    };

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
-        biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
-
-    biv_lut_mem->broadcast_lut(active_streams);
+    biv_lut_mem->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {generate_bi_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);

    this->tmp_ct = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -232,7 +222,7 @@ template <typename Torus> struct int_ilog2_buffer {
        this->sum_output_not_propagated, counter_num_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

-    this->lut_message_not =
+    lut_message_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
                                 allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus)> lut_message_lambda =
@@ -240,16 +230,11 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t message = x % this->params.message_modulus;
      return (~message) % this->params.message_modulus;
    };
-    generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
-                                this->lut_message_not->get_lut(0, 0),
-                                this->lut_message_not->get_degree(0),
-                                this->lut_message_not->get_max_degree(0),
-                                params.glwe_dimension, params.polynomial_size,
-                                params.message_modulus, params.carry_modulus,
-                                lut_message_lambda, allocate_gpu_memory);
+
    auto active_streams =
        streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
-    lut_message_not->broadcast_lut(active_streams);
+    lut_message_not->generate_and_broadcast_lut(
+        active_streams, {0}, {lut_message_lambda}, LUT_0_FOR_ALL_BLOCKS);

    this->lut_carry_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -259,13 +244,8 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t carry = x / this->params.message_modulus;
      return (~carry) % this->params.message_modulus;
    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0),
-        this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
-        this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        lut_carry_lambda, allocate_gpu_memory);
-    lut_carry_not->broadcast_lut(active_streams);
+    lut_carry_not->generate_and_broadcast_lut(
+        active_streams, {0}, {lut_carry_lambda}, LUT_0_FOR_ALL_BLOCKS);

    this->message_blocks_not = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
@@ -37,17 +37,12 @@ template <typename Torus> struct int_mul_memory {
      zero_out_predicate_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);
-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          zero_out_predicate_lut->get_lut(0, 0),
-          zero_out_predicate_lut->get_degree(0),
-          zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          zero_out_predicate_lut_f, gpu_memory_allocated);

      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      zero_out_predicate_lut->broadcast_lut(active_streams);
+      zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {zero_out_predicate_lut_f},
+          LUT_0_FOR_ALL_BLOCKS);

      zero_out_mem = new int_zero_out_if_buffer<Torus>(
          streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -55,10 +50,7 @@ template <typename Torus> struct int_mul_memory {
      return;
    }

-    auto glwe_dimension = params.glwe_dimension;
-    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
-    auto carry_modulus = params.carry_modulus;

    // 'vector_result_lsb' contains blocks from all possible shifts of
    // radix_lwe_left excluding zero ciphertext blocks
@@ -70,6 +62,10 @@ template <typename Torus> struct int_mul_memory {

    int total_block_count = num_radix_blocks * num_radix_blocks;

+    GPU_ASSERT(lsb_vector_block_count + msb_vector_block_count ==
+                   total_block_count,
+               "MSB and LSB vector block counts don't match");
+
    // allocate memory for intermediate buffers
    vector_result_sb = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -91,8 +87,6 @@ template <typename Torus> struct int_mul_memory {
    // luts_array -> lut = {lsb_acc, msb_acc}
    luts_array = new int_radix_lut<Torus>(streams, params, 2, total_block_count,
                                          allocate_gpu_memory, size_tracker);
-    auto lsb_acc = luts_array->get_lut(0, 0);
-    auto msb_acc = luts_array->get_lut(0, 1);

    // define functions for each accumulator
    auto lut_f_lsb = [message_modulus](Torus x, Torus y) -> Torus {
@@ -102,30 +96,21 @@ template <typename Torus> struct int_mul_memory {
      return (x * y) / message_modulus;
    };

-    // generate accumulators
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), lsb_acc,
-        luts_array->get_degree(0), luts_array->get_max_degree(0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        lut_f_lsb, gpu_memory_allocated);
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), msb_acc,
-        luts_array->get_degree(1), luts_array->get_max_degree(1),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        lut_f_msb, gpu_memory_allocated);
-
    // lut_indexes_vec for luts_array should be reinitialized
    // first lsb_vector_block_count value should reference to lsb_acc
    // last msb_vector_block_count values should reference to msb_acc
    // for message and carry default lut_indexes_vec is fine
-    if (allocate_gpu_memory)
-      cuda_set_value_async<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
-          msb_vector_block_count);
    auto active_streams =
        streams.active_gpu_subset(total_block_count, params.pbs_type);
-    luts_array->broadcast_lut(active_streams);
+    auto lut_index_generator = [lsb_vector_block_count](Torus *h_lut_indexes,
+                                                        uint32_t num_indexes) {
+      for (uint32_t i = 0; i < num_indexes; i++) {
+        h_lut_indexes[i] = (i < lsb_vector_block_count) ? 0 : 1;
+      }
+    };
+    luts_array->generate_and_broadcast_bivariate_lut(
+        active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, lut_index_generator);
+
    // create memory object for sum ciphertexts
    sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
        streams, params, num_radix_blocks, 2 * num_radix_blocks,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
@@ -22,8 +22,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
    uint32_t calculated_active_blocks =
        total_random_bits == 0
            ? 0
-            : (total_random_bits + message_bits_per_block - 1) /
-                  message_bits_per_block;
+            : CEIL_DIV(total_random_bits, message_bits_per_block);
    if (num_blocks_to_process != calculated_active_blocks) {
      PANIC(
          "num_blocks_to_process should be equal to calculated_active_blocks");
@@ -53,6 +52,10 @@ template <typename Torus> struct int_grouped_oprf_memory {

    // Pre-generate all possible LUTs.
    //
+    std::vector<std::function<Torus(Torus)>> lut_funcs;
+    std::vector<uint32_t> lut_indices;
+    std::vector<uint64_t> lut_degrees;
+
    for (uint32_t random_bit = 1; random_bit <= message_bits_per_block;
         ++random_bit) {
      uint64_t p = 1ULL << random_bit;
@@ -70,14 +73,13 @@ template <typename Torus> struct int_grouped_oprf_memory {

      uint64_t degree = 0;
      uint32_t lut_index = random_bit - 1;
-      generate_device_accumulator_no_encoding<Torus>(
-          streams.stream(0), streams.gpu_index(0), luts->get_lut(0, lut_index),
-          degree, params.message_modulus, params.carry_modulus,
-          params.glwe_dimension, params.polynomial_size, lut_f,
-          allocate_gpu_memory);
+
+      lut_funcs.push_back(lut_f);
+      lut_indices.push_back(lut_index);
+
      // In  OPRF the degree is hard set to p - 1 instead of the LUT degree
      degree = p - 1;
-      *luts->get_degree(lut_index) = degree;
+      lut_degrees.push_back(degree);
    }

    // For each block, this loop determines the exact number of bits to generate
@@ -102,10 +104,6 @@ template <typename Torus> struct int_grouped_oprf_memory {
      Torus plaintext_to_add = (p - 1) * delta / 2;

      h_corrections[i * lwe_size + params.big_lwe_dimension] = plaintext_to_add;
-      if (bits_for_this_block < 1) {
-        PANIC("bits_for_this_block should be greater than 1");
-      }
-      this->h_lut_indexes[i] = bits_for_this_block - 1;

      bits_processed += bits_for_this_block;
    }
@@ -122,13 +120,35 @@ template <typename Torus> struct int_grouped_oprf_memory {

    // Copy the prepared LUT indexes to the GPU 0, before broadcast to all other
    // GPUs.
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        luts->get_lut_indexes(0, 0), this->h_lut_indexes,
-        num_blocks_to_process * sizeof(Torus), streams.stream(0),
-        streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
-    luts->broadcast_lut(active_streams);
+    // No encoding for these LUTS. Generate LUT also sets LUT degrees to default
+    // values
+    auto luts_index_generator = [total_random_bits, message_bits_per_block](
+                                    Torus *h_lut_indexes, uint32_t num_blocks) {
+      uint64_t bits_processed = 0;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        if (total_random_bits <= bits_processed) {
+          PANIC("total_random_bits should be greater than bits_processed");
+        }
+        uint64_t bits_remaining = total_random_bits - bits_processed;
+        uint32_t bits_for_this_block =
+            std::min((uint64_t)message_bits_per_block, bits_remaining);
+        if (bits_for_this_block < 1) {
+          PANIC("bits_for_this_block should be greater than 1");
+        }
+        h_lut_indexes[i] = bits_for_this_block - 1;
+        bits_processed += bits_for_this_block;
+      }
+    };
+    luts->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
+                                     luts_index_generator, false, {},
+                                     this->h_lut_indexes);
+
+    // OPRF requires custom LUT degrees
+    for (uint32_t i = 0; i < lut_degrees.size(); ++i) {
+      *luts->get_degree(i) = lut_degrees[i];
+    }

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_corrections);
@@ -170,8 +190,7 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
    this->allocate_gpu_memory = allocate_gpu_memory;

    this->num_random_input_blocks =
-        (num_input_random_bits + message_bits_per_block - 1) /
-        message_bits_per_block;
+        CEIL_DIV(num_input_random_bits, message_bits_per_block);

    this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
        streams, params, this->num_random_input_blocks, message_bits_per_block,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
@@ -85,15 +85,11 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
-          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          shift_lut_f, gpu_memory_allocated);
+
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->broadcast_lut(active_streams);
+      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);

      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
@@ -172,16 +168,10 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
-          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          shift_lut_f, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->broadcast_lut(active_streams);
-
+      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
+          active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);
      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
  }
@@ -271,16 +261,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return shifted | padding;
      };

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          shift_last_block_lut_univariate->get_lut(0, 0),
-          shift_last_block_lut_univariate->get_degree(0),
-          shift_last_block_lut_univariate->get_max_degree(0),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
      auto active_streams_shift_last =
          streams.active_gpu_subset(1, params.pbs_type);
-      shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
+      shift_last_block_lut_univariate->generate_and_broadcast_lut(
+          active_streams_shift_last, {0}, {last_block_lut_f},
+          LUT_0_FOR_ALL_BLOCKS);

      lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
    }
@@ -298,15 +283,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
      return (params.message_modulus - 1) * x_sign_bit;
    };

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        padding_block_lut_univariate->get_lut(0, 0),
-        padding_block_lut_univariate->get_degree(0),
-        padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        padding_block_lut_f, gpu_memory_allocated);
-    // auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-    padding_block_lut_univariate->broadcast_lut(active_streams);
+    padding_block_lut_univariate->generate_and_broadcast_lut(
+        active_streams, {0}, {padding_block_lut_f}, LUT_0_FOR_ALL_BLOCKS);

    lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -339,16 +317,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return message_of_current_block + carry_of_previous_block;
      };

-      generate_device_accumulator_bivariate<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          shift_blocks_lut_bivariate->get_lut(0, 0),
-          shift_blocks_lut_bivariate->get_degree(0),
-          shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          blocks_lut_f, gpu_memory_allocated);
      auto active_streams_shift_blocks =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
+      shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
+          active_streams_shift_blocks, {0}, {blocks_lut_f},
+          LUT_0_FOR_ALL_BLOCKS);

      lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
    }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
@@ -113,27 +113,20 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
      else
        return current_bit;
    };
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
-        mux_lut->get_degree(0), mux_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, mux_lut_f, gpu_memory_allocated);
    auto active_gpu_count_mux = streams.active_gpu_subset(
        bits_per_block * num_radix_blocks, params.pbs_type);
-    mux_lut->broadcast_lut(active_gpu_count_mux);
+
+    mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
+                                        LUT_0_FOR_ALL_BLOCKS);

    auto cleaning_lut_f = [params](Torus x) -> Torus {
      return x % params.message_modulus;
    };
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
-        cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
+
    auto active_gpu_count_cleaning =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
+    cleaning_lut->generate_and_broadcast_lut(
+        active_gpu_count_cleaning, {0}, {cleaning_lut_f}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
@@ -74,45 +74,27 @@ template <typename Torus> struct int_overflowing_sub_memory {
                                           luts_array, size_tracker,
                                           allocate_gpu_memory, size_tracker);

-    auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
-    auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
-
-    // generate luts (aka accumulators)
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
-        luts_array->get_degree(0), luts_array->get_max_degree(0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        f_lut_does_block_generate_carry, gpu_memory_allocated);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        lut_does_block_generate_or_propagate, luts_array->get_degree(1),
-        luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
-        gpu_memory_allocated);
-    if (allocate_gpu_memory)
-      cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                  luts_array->get_lut_indexes(0, 1), 1,
-                                  num_radix_blocks - 1);
-
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        luts_borrow_propagation_sum->get_lut(0, 0),
-        luts_borrow_propagation_sum->get_degree(0),
-        luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
-        f_luts_borrow_propagation_sum, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
-        message_acc->get_degree(0), message_acc->get_max_degree(0),
-        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-        f_message_acc, gpu_memory_allocated);
-
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_array->broadcast_lut(active_streams);
-    luts_borrow_propagation_sum->broadcast_lut(active_streams);
-    message_acc->broadcast_lut(active_streams);
+    luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
+        active_streams, {0}, {f_luts_borrow_propagation_sum},
+        LUT_0_FOR_ALL_BLOCKS);
+
+    auto luts_array_index_generator = [](Torus *h_lut_indexes,
+                                         uint32_t num_indexes) {
+      for (uint32_t i = 0; i < num_indexes; i++) {
+        h_lut_indexes[i] = (i == 0) ? 0 : 1;
+      }
+    };
+    luts_array->generate_and_broadcast_lut(
+        active_streams, {0, 1},
+        {f_lut_does_block_generate_carry,
+         f_lut_does_block_generate_or_propagate},
+        luts_array_index_generator);
+    // generate luts (aka accumulators)
+
+    message_acc->generate_and_broadcast_lut(
+        active_streams, {0}, {f_message_acc}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
@@ -7,7 +7,8 @@
 #include <functional>
 #include <vector>

-const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
+// If we use more than 5 streams the result is incorrect
+const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 5;

 template <typename Torus> struct int_equality_selectors_buffer {
  int_radix_params params;
@@ -60,18 +61,10 @@ template <typename Torus> struct int_equality_selectors_buffer {
      fns.push_back([i](Torus x) -> Torus { return (x == i); });
    }

-    generate_many_lut_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->comparison_luts->get_lut(0, 0),
-        this->comparison_luts->get_degree(0),
-        this->comparison_luts->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        fns, allocate_gpu_memory);
-
+    this->comparison_luts->generate_and_broadcast_many_lut(
+        active_streams, {0}, {fns}, LUT_0_FOR_ALL_BLOCKS);
    fns.clear();

-    this->comparison_luts->broadcast_lut(active_streams);
-
    this->tmp_many_luts_output = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), this->tmp_many_luts_output,
@@ -175,8 +168,7 @@ template <typename Torus> struct int_possible_results_buffer {
    this->lut_stride =
        (ciphertext_modulus / this->max_luts_per_call) * box_size;

-    this->num_lut_accumulators =
-        (total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
+    this->num_lut_accumulators = CEIL_DIV(total_luts_needed, max_luts_per_call);

    stream_luts =
        new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
@@ -202,15 +194,10 @@ template <typename Torus> struct int_possible_results_buffer {
          fns.push_back([c](Torus x) -> Torus { return (x == 1) * c; });
        }

-        generate_many_lut_device_accumulator<Torus>(
-            streams.stream(0), streams.gpu_index(0), current_lut->get_lut(0, 0),
-            current_lut->get_degree(0), current_lut->get_max_degree(0),
-            params.glwe_dimension, params.polynomial_size,
-            params.message_modulus, params.carry_modulus, fns,
-            allocate_gpu_memory);
+        current_lut->generate_and_broadcast_many_lut(
+            streams.active_gpu_subset(1, params.pbs_type), {0}, {fns},
+            LUT_0_FOR_ALL_BLOCKS);

-        current_lut->broadcast_lut(
-            streams.active_gpu_subset(1, params.pbs_type));
        stream_luts[lut_count++] = current_lut;
        lut_value_start += luts_in_this_call;
      }
@@ -298,14 +285,10 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
      int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          id_fn, allocate_gpu_memory);
+      lut->generate_and_broadcast_lut(
+          streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
+          LUT_0_FOR_ALL_BLOCKS);

-      lut->broadcast_lut(
-          streams.active_gpu_subset(num_blocks, params.pbs_type));
      this->stream_identity_luts[i] = lut;
    }

@@ -318,27 +301,17 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {

    this->message_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->message_extract_lut->get_lut(0, 0),
-        this->message_extract_lut->get_degree(0),
-        this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        msg_fn, allocate_gpu_memory);
-    this->message_extract_lut->broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type));
+
+    this->message_extract_lut->generate_and_broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
+        LUT_0_FOR_ALL_BLOCKS);

    this->carry_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->carry_extract_lut->get_lut(0, 0),
-        this->carry_extract_lut->get_degree(0),
-        this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        carry_fn, allocate_gpu_memory);
-    this->carry_extract_lut->broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type));
+
+    this->carry_extract_lut->generate_and_broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
+        LUT_0_FOR_ALL_BLOCKS);

    this->partial_aggregated_vectors =
        new CudaRadixCiphertextFFI *[num_streams];
@@ -1185,15 +1158,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->prefix_sum_lut->get_lut(0, 0),
-        this->prefix_sum_lut->get_degree(0),
-        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        prefix_sum_fn, allocate_gpu_memory);
-    this->prefix_sum_lut->broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type));
+    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
+        {prefix_sum_fn}, LUT_0_FOR_ALL_BLOCKS);

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1203,14 +1170,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
-        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        cleanup_fn, allocate_gpu_memory);
-    this->cleanup_lut->broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type));
+    this->cleanup_lut->generate_and_broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
+        {cleanup_fn}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
@@ -1376,15 +1338,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->prefix_sum_lut->get_lut(0, 0),
-        this->prefix_sum_lut->get_degree(0),
-        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        prefix_sum_fn, allocate_gpu_memory);
-    this->prefix_sum_lut->broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type));
+    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
+        {prefix_sum_fn}, LUT_0_FOR_ALL_BLOCKS);

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1394,14 +1350,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
-        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
-        params.polynomial_size, params.message_modulus, params.carry_modulus,
-        cleanup_fn, allocate_gpu_memory);
-    this->cleanup_lut->broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type));
+    this->cleanup_lut->generate_and_broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
+        {cleanup_fn}, LUT_0_FOR_ALL_BLOCKS);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
@@ -73,9 +73,10 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
                                                int8_t **fp_ks_buffer,
                                                bool gpu_memory_allocated);

-void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
-                                   void const *input, void *output,
-                                   uint32_t base_log, uint32_t level_count);
+void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
+                                         void const *input, void *output,
+                                         uint32_t base_log,
+                                         uint32_t level_count);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium.h
@@ -0,0 +1,24 @@
+#ifndef KREYVIUM_H
+#define KREYVIUM_H
+
+#include "../integer/integer.h"
+
+extern "C" {
+uint64_t scratch_cuda_kreyvium_64(
+    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
+
+void cuda_kreyvium_generate_keystream_64(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
+    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
+    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks);
+
+void cleanup_cuda_kreyvium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
+}
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/kreyvium/kreyvium_utilities.h
@@ -0,0 +1,320 @@
+#ifndef KREYVIUM_UTILITIES_H
+#define KREYVIUM_UTILITIES_H
+#include "../integer/integer_utilities.h"
+
+// Kreyvium specific constants
+// The batch size is set to 64 to allow efficient parallel processing of 64
+// steps at once.
+constexpr uint32_t KREYVIUM_BATCH_SIZE = 64;
+
+// In each Kreyvium step, there are exactly 3 non-linear AND operations:
+// 1. (c109 & c108)
+// 2. (a91 & a90)
+// 3. (b82 & b81)
+constexpr uint32_t KREYVIUM_NUM_AND_GATES = 3;
+
+// In each Kreyvium step, there are 4 paths that require a "flush"
+// to noise-cancel and extract the bit:
+// 1. New bit for Register A
+// 2. New bit for Register B
+// 3. New bit for Register C
+// 4. The Output Keystream bit
+constexpr uint32_t KREYVIUM_NUM_FLUSH_PATHS = 4;
+
+/// Owns the two LUTs Kreyvium needs: a bivariate AND and a univariate flush.
+template <typename Torus> struct int_kreyvium_lut_buffers {
+  // Bivariate AND Gate LUT:
+  // AND operation: f(a, b) = (a & 1) & (b & 1).
+  // This is a Bivariate PBS used for the non-linear parts of Kreyvium.
+  int_radix_lut<Torus> *and_lut;
+
+  // Univariate Flush LUT (identity only on single-bit inputs):
+  // MESSAGE EXTRACTION operation: f(x) = x & 1.
+  // This is a Univariate PBS used to "flush" the state (reset noise/carries).
+  int_radix_lut<Torus> *flush_lut;
+
+  int_kreyvium_lut_buffers(CudaStreams streams, const int_radix_params &params,
+                           bool allocate_gpu_memory, uint32_t num_inputs,
+                           uint64_t &size_tracker) {
+    // Sizes passed to each LUT: num_inputs * batch size * ops per step.
+    uint32_t and_ops =
+        num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_AND_GATES;
+    uint32_t flush_ops =
+        num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_FLUSH_PATHS;
+
+    this->and_lut = new int_radix_lut<Torus>(streams, params, 1, and_ops,
+                                             allocate_gpu_memory, size_tracker);
+
+    std::function<Torus(Torus, Torus)> and_lambda =
+        [](Torus lhs, Torus rhs) -> Torus { return (lhs & 1) & (rhs & 1); };
+    // Build the accumulator on stream 0 / GPU 0, then broadcast it below.
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
+        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, and_lambda, allocate_gpu_memory);
+
+    auto active_streams_and =
+        streams.active_gpu_subset(and_ops, params.pbs_type);
+    this->and_lut->broadcast_lut(active_streams_and);
+    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
+
+    this->flush_lut = new int_radix_lut<Torus>(
+        streams, params, 1, flush_ops, allocate_gpu_memory, size_tracker);
+
+    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
+      return x & 1;
+    };
+    // Same two-step setup as and_lut: generate on stream 0, then broadcast.
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
+        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, flush_lambda, allocate_gpu_memory);
+
+    auto active_streams_flush =
+        streams.active_gpu_subset(flush_ops, params.pbs_type);
+    this->flush_lut->broadcast_lut(active_streams_flush);
+    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
+  }
+
+  void release(CudaStreams streams) {
+    this->and_lut->release(streams);
+    delete this->and_lut;
+    this->and_lut = nullptr;
+
+    this->flush_lut->release(streams);
+    delete this->flush_lut;
+    this->flush_lut = nullptr;
+    // Wait on stream 0 before returning to the caller.
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
+
+/// Struct to hold the Kreyvium internal state and temporary workspaces.
+template <typename Torus> struct int_kreyvium_state_workspaces {
+
+  CudaRadixCiphertextFFI *a_reg;  // sized with 93 * num_inputs
+  CudaRadixCiphertextFFI *b_reg;  // sized with 84 * num_inputs
+  CudaRadixCiphertextFFI *c_reg;  // sized with 111 * num_inputs
+  CudaRadixCiphertextFFI *k_reg;  // sized with 128 * num_inputs
+  CudaRadixCiphertextFFI *iv_reg; // sized with 128 * num_inputs
+
+  // Shift workspace (sized with 128 * num_inputs)
+  CudaRadixCiphertextFFI *shift_workspace;
+
+  // Temporary update buffers (KREYVIUM_BATCH_SIZE * num_inputs each)
+  CudaRadixCiphertextFFI *temp_a;
+  CudaRadixCiphertextFFI *temp_b;
+  CudaRadixCiphertextFFI *temp_c;
+
+  CudaRadixCiphertextFFI *packed_and_lhs; // KREYVIUM_NUM_AND_GATES * batch
+  CudaRadixCiphertextFFI *packed_and_rhs; // KREYVIUM_NUM_AND_GATES * batch
+  CudaRadixCiphertextFFI *packed_and_out; // KREYVIUM_NUM_AND_GATES * batch
+
+  // Flush/cleanup packing buffers (KREYVIUM_NUM_FLUSH_PATHS * batch each)
+  CudaRadixCiphertextFFI *packed_flush_in;
+  CudaRadixCiphertextFFI *packed_flush_out;
+
+  uint32_t max_batch_blocks; // KREYVIUM_BATCH_SIZE * num_inputs
+  uint32_t k_offset;         // set to 0 by the constructor
+  uint32_t iv_offset;        // set to 0 by the constructor
+
+  int_kreyvium_state_workspaces(CudaStreams streams,
+                                const int_radix_params &params,
+                                bool allocate_gpu_memory, uint32_t num_inputs,
+                                uint64_t &size_tracker) {
+
+    uint32_t batch_blocks = KREYVIUM_BATCH_SIZE * num_inputs;
+    this->max_batch_blocks = batch_blocks;
+    this->k_offset = 0;
+    this->iv_offset = 0;
+
+    this->a_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->a_reg, 93 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->b_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->b_reg, 84 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->c_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->c_reg, 111 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->k_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->k_reg, 128 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->iv_reg = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->iv_reg, 128 * num_inputs,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->shift_workspace = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->shift_workspace,
+        128 * num_inputs, params.big_lwe_dimension, size_tracker,
+        allocate_gpu_memory);
+
+    this->temp_a = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->temp_a, batch_blocks,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->temp_b = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->temp_b, batch_blocks,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->temp_c = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->temp_c, batch_blocks,
+        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
+
+    this->packed_and_lhs = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_and_lhs,
+        KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+
+    this->packed_and_rhs = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_and_rhs,
+        KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+
+    this->packed_and_out = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_and_out,
+        KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+
+    this->packed_flush_in = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_flush_in,
+        KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+
+    this->packed_flush_out = new CudaRadixCiphertextFFI;
+    create_zero_radix_ciphertext_async<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->packed_flush_out,
+        KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
+        size_tracker, allocate_gpu_memory);
+  }
+
+  void release(CudaStreams streams, bool allocate_gpu_memory) {
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->a_reg, allocate_gpu_memory);
+    delete this->a_reg;
+    this->a_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->b_reg, allocate_gpu_memory);
+    delete this->b_reg;
+    this->b_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->c_reg, allocate_gpu_memory);
+    delete this->c_reg;
+    this->c_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->k_reg, allocate_gpu_memory);
+    delete this->k_reg;
+    this->k_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->iv_reg, allocate_gpu_memory);
+    delete this->iv_reg;
+    this->iv_reg = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->shift_workspace, allocate_gpu_memory);
+    delete this->shift_workspace;
+    this->shift_workspace = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->temp_a, allocate_gpu_memory);
+    delete this->temp_a;
+    this->temp_a = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->temp_b, allocate_gpu_memory);
+    delete this->temp_b;
+    this->temp_b = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->temp_c, allocate_gpu_memory);
+    delete this->temp_c;
+    this->temp_c = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_and_lhs, allocate_gpu_memory);
+    delete this->packed_and_lhs;
+    this->packed_and_lhs = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_and_rhs, allocate_gpu_memory);
+    delete this->packed_and_rhs;
+    this->packed_and_rhs = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_and_out, allocate_gpu_memory);
+    delete this->packed_and_out;
+    this->packed_and_out = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_flush_in, allocate_gpu_memory);
+    delete this->packed_flush_in;
+    this->packed_flush_in = nullptr;
+
+    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
+                                   this->packed_flush_out, allocate_gpu_memory);
+    delete this->packed_flush_out;
+    this->packed_flush_out = nullptr;
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
+
+template <typename Torus> struct int_kreyvium_buffer {
+  int_radix_params params;  // copy of the constructor argument
+  bool allocate_gpu_memory; // forwarded to state->release()
+  uint32_t num_inputs;      // copy of the constructor argument
+  // Owned sub-buffers: allocated in the constructor, freed in release().
+  int_kreyvium_lut_buffers<Torus> *luts;
+  int_kreyvium_state_workspaces<Torus> *state;
+
+  int_kreyvium_buffer(CudaStreams streams, const int_radix_params &params,
+                      bool allocate_gpu_memory, uint32_t num_inputs,
+                      uint64_t &size_tracker) {
+    this->params = params;
+    this->allocate_gpu_memory = allocate_gpu_memory;
+    this->num_inputs = num_inputs;
+
+    this->luts = new int_kreyvium_lut_buffers<Torus>(
+        streams, params, allocate_gpu_memory, num_inputs, size_tracker);
+
+    this->state = new int_kreyvium_state_workspaces<Torus>(
+        streams, params, allocate_gpu_memory, num_inputs, size_tracker);
+  }
+
+  void release(CudaStreams streams) {
+    luts->release(streams);
+    delete luts;
+    luts = nullptr;
+
+    state->release(streams, allocate_gpu_memory);
+    delete state;
+    state = nullptr;
+
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+  }
+};
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -429,11 +429,9 @@ uint64_t get_buffer_size_programmable_bootstrap_cg(
 }

 template <typename Torus>
-bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
-                                                   uint32_t polynomial_size,
-                                                   uint32_t level_count,
-                                                   uint32_t num_samples,
-                                                   uint32_t max_shared_memory);
+bool has_support_to_cuda_programmable_bootstrap_cg(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t num_samples, uint32_t max_shared_memory, uint32_t base_log);

 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -13,12 +13,12 @@ void cuda_fourier_polynomial_mul(void *stream, uint32_t gpu_index,
 void cuda_convert_lwe_programmable_bootstrap_key_32(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size);
+    uint32_t polynomial_size, uint32_t base_log);

 void cuda_convert_lwe_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size);
+    uint32_t polynomial_size, uint32_t base_log);

 void cuda_convert_lwe_programmable_bootstrap_key_128(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
@@ -61,13 +61,13 @@ uint64_t scratch_cuda_programmable_bootstrap_32(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t base_log);

 uint64_t scratch_cuda_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t base_log);

 uint64_t scratch_cuda_programmable_bootstrap_128(
    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
@@ -30,15 +30,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
    std::function<Torus(Torus, Torus)> and_lambda =
        [](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };

-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
-        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, and_lambda, allocate_gpu_memory);
-
    auto active_streams_and =
        streams.active_gpu_subset(total_lut_ops, params.pbs_type);
-    this->and_lut->broadcast_lut(active_streams_and);
+    this->and_lut->generate_and_broadcast_bivariate_lut(
+        active_streams_and, {0}, {and_lambda}, LUT_0_FOR_ALL_BLOCKS);
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
@@ -50,15 +45,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
      return x & 1;
    };

-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
-        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, flush_lambda, allocate_gpu_memory);
-
    auto active_streams_flush =
        streams.active_gpu_subset(total_flush_ops, params.pbs_type);
-    this->flush_lut->broadcast_lut(active_streams_flush);
+    this->flush_lut->generate_and_broadcast_lut(
+        active_streams_flush, {0}, {flush_lambda}, LUT_0_FOR_ALL_BLOCKS);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -14,10 +14,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
    uint32_t casting_output_dimension, uint32_t casting_ks_level,
    uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
-    const bool *is_boolean_array, uint32_t num_compact_lists,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    KS_TYPE casting_key_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    const bool *is_boolean_array, const uint32_t is_boolean_array_len,
+    uint32_t num_compact_lists, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_expand_without_verification_64(
    CudaStreamsFFI streams, void *lwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -118,7 +118,8 @@ template <typename Torus> struct zk_expand_mem {
  zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
-                const bool *is_boolean_array, uint32_t num_compact_lists,
+                const bool *is_boolean_array,
+                const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
                bool allocate_gpu_memory, uint64_t &size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
@@ -174,40 +175,6 @@ template <typename Torus> struct zk_expand_mem {
    message_and_carry_extract_luts = new int_radix_lut<Torus>(
        streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);

-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_and_carry_extract_luts->get_lut(0, 0),
-        message_and_carry_extract_luts->get_degree(0),
-        message_and_carry_extract_luts->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_and_carry_extract_luts->get_lut(0, 1),
-        message_and_carry_extract_luts->get_degree(1),
-        message_and_carry_extract_luts->get_max_degree(1),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_and_carry_extract_luts->get_lut(0, 2),
-        message_and_carry_extract_luts->get_degree(2),
-        message_and_carry_extract_luts->get_max_degree(2),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, message_extract_and_sanitize_bool_lut_f,
-        gpu_memory_allocated);
-
-    generate_device_accumulator<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        message_and_carry_extract_luts->get_lut(0, 3),
-        message_and_carry_extract_luts->get_degree(3),
-        message_and_carry_extract_luts->get_max_degree(3),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
-        gpu_memory_allocated);
-
    // We are always packing two LWEs. We just need to be sure we have enough
    // space in the carry part to store a message of the same size as is in the
    // message part.
@@ -270,29 +237,65 @@ template <typename Torus> struct zk_expand_mem {
      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
        auto lwe_index = i + num_packed_msgs * offset;
        auto lwe_index_in_list = i % num_lwes_in_kth;
+        PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       lwe_index, num_packed_msgs * num_lwes);
        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
        h_indexes_out[lwe_index] =
            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
-        // If the input relates to a boolean, shift the LUT so the correct one
-        // with sanitization is used
-        auto boolean_offset =
-            is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
-        h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+        PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       h_indexes_in[lwe_index], num_packed_msgs * num_lwes);
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       h_indexes_out[lwe_index], num_packed_msgs * num_lwes);
+        // is_boolean_array tells us which input is a boolean and thus the
+        // related output needs boolean sanitization. It naturally has
+        // total_blocks entries, but h_indexes_out reaches
+        // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
+        // the ceiling causes out-of-bounds access. Reading garbage "true" would
+        // set h_lut_indexes to an invalid index pointing to uninitialized
+        // memory instead of a real LUT. Rust pads is_boolean_array with FALSE
+        // to match.
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
+                       "Cuda error: index %d for is_boolean_array is out of "
+                       "bounds (len is %d)",
+                       h_indexes_out[lwe_index], is_boolean_array_len);
      }
      offset += num_lwes_in_kth;
    }

    message_and_carry_extract_luts->set_lwe_indexes(
        streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
-    auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
-
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
-        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);

    auto active_streams =
        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
-    message_and_carry_extract_luts->broadcast_lut(active_streams);
+
+    // Index generator for message/carry extraction LUTs
+    auto index_gen = [num_compact_lists,
+                      num_lwes_per_compact_list =
+                          this->num_lwes_per_compact_list,
+                      num_packed_msgs, is_boolean_array,
+                      h_indexes_out](Torus *h_lut_indexes, uint32_t) {
+      auto offset = 0;
+      for (int k = 0; k < num_compact_lists; k++) {
+        auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+        for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
+          auto lwe_index = i + num_packed_msgs * offset;
+          auto boolean_offset =
+              is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
+          h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+        }
+        offset += num_lwes_in_kth;
+      }
+    };
+
+    message_and_carry_extract_luts->generate_and_broadcast_lut(
+        active_streams, {0, 1, 2, 3},
+        {message_extract_lut_f, carry_extract_lut_f,
+         message_extract_and_sanitize_bool_lut_f,
+         carry_extract_and_sanitize_bool_lut_f},
+        index_gen, true, {}, h_lut_indexes);

    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -183,9 +183,10 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_128(
      base_log, level_count, num_lwes);
 }

-void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
-                                   void const *input, void *output,
-                                   uint32_t base_log, uint32_t level_count) {
+void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
+                                         void const *input, void *output,
+                                         uint32_t base_log,
+                                         uint32_t level_count) {
  host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
                                  static_cast<const uint64_t *>(input),
                                  static_cast<uint64_t *>(output), base_log,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -10,7 +10,6 @@
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
 #include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

@@ -351,6 +350,7 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
      Torus state =
          init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
      uint32_t offset = i * level_count * (lwe_dimension_out + 1);
+#pragma unroll 1
      for (int j = 0; j < level_count; j++) {

        KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
@@ -363,16 +363,15 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
    lwe_acc_out[shmem_index] = local_lwe_out;
  }

-  if (tid <= lwe_dimension_out) {
-    for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
-         offset /= 2) {
-      __syncthreads();
+  for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
+    __syncthreads();
+    if (tid <= lwe_dimension_out && threadIdx.y < offset) {
      lwe_acc_out[shmem_index] +=
          lwe_acc_out[shmem_index + offset * blockDim.x];
    }
-    if (threadIdx.y == 0)
-      block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
  }
+  if (tid <= lwe_dimension_out && threadIdx.y == 0)
+    block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
 }

 template <typename Torus, typename KSTorus>
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/packing_keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/packing_keyswitch.cuh
@@ -12,12 +12,9 @@
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
 #include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

-#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
-
 // Finish the keyswitching operation and prepare GLWEs for accumulation.
 // 1. Finish the keyswitching computation partially performed with a GEMM:
 //  - negate the dot product between the GLWE and KSK polynomial
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -6,7 +6,7 @@
 #include "helper_multi_gpu.h"
 #include "polynomial/parameters.cuh"
 #include "types/int128.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <limits>

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -1,4 +1,5 @@
 #include "device.h"
+#include "utils/helper.cuh"
 #include <cstdint>
 #include <cuda_runtime.h>
 #include <mutex>
@@ -6,6 +7,27 @@
 #include <cuda_profiler_api.h>
 #endif

+void validate_device_ptr_and_gpu_index(const void *ptr, uint32_t gpu_index) {
+  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
+  // Calls PANIC unless ptr is device memory that CUDA reports on gpu_index.
+  cudaPointerAttributes attr; // filled in by cudaPointerGetAttributes
+  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
+  if (attr.device != gpu_index || attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer.")
+  }
+}
+// Checks that ptr is device memory and returns the device CUDA reports for it.
+int validate_device_ptr(const void *ptr) {
+  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
+  // Ask the CUDA runtime for the memory type and device of this address.
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
+  if (attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer.")
+  }
+  return attr.device; // device field as filled by cudaPointerGetAttributes
+}
+
 uint32_t cuda_get_device() {
  int device;
  check_cuda_error(cudaGetDevice(&device));
@@ -247,13 +269,12 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
                                                 cudaStream_t stream,
                                                 uint32_t gpu_index,
                                                 bool gpu_memory_allocated) {
+
+  GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
+
  if (size == 0 || !gpu_memory_allocated)
    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
-  }
+  validate_device_ptr_and_gpu_index(dest, gpu_index);

  cuda_set_device(gpu_index);
  check_cuda_error(
@@ -280,28 +301,16 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    uint32_t gpu_index, bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;
-  GPU_ASSERT(dest != nullptr,
-             "Cuda error: trying to copy gpu->gpu to null ptr");
-  GPU_ASSERT(src != nullptr,
-             "Cuda error: trying to copy gpu->gpu from null ptr");

-  cudaPointerAttributes attr_dest;
-  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  PANIC_IF_FALSE(
-      attr_dest.type == cudaMemoryTypeDevice,
-      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
-  cudaPointerAttributes attr_src;
-  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  PANIC_IF_FALSE(
-      attr_src.type == cudaMemoryTypeDevice,
-      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
+  int src_gpu_index = validate_device_ptr(src);
+  int dest_gpu_index = validate_device_ptr(dest);
  cuda_set_device(gpu_index);
-  if (attr_src.device == attr_dest.device) {
+  if (src_gpu_index == dest_gpu_index) {
    check_cuda_error(
        cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
  } else {
-    check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
-                                         attr_src.device, size, stream));
+    check_cuda_error(cudaMemcpyPeerAsync(dest, dest_gpu_index, src,
+                                         src_gpu_index, size, stream));
  }
 }
 void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
@@ -349,11 +358,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                          bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
-  }
+  validate_device_ptr_and_gpu_index(dest, gpu_index);
  cuda_set_device(gpu_index);
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }
@@ -383,7 +388,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
    }
    cuda_set_device(gpu_index);
    int block_size = 256;
-    int num_blocks = (n + block_size - 1) / block_size;
+    int num_blocks = CEIL_DIV(n, block_size);

    // Launch the kernel
    cuda_set_value_kernel<Torus>
@@ -406,13 +411,10 @@ template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
 /// so it should be avoided at all costs
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
+  GPU_ASSERT(dest != nullptr, "Cuda error: null host ptr");
  if (size == 0)
    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, src));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
-  }
+  validate_device_ptr_and_gpu_index(src, gpu_index);

  cuda_set_device(gpu_index);
  check_cuda_error(
--- a/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
@@ -188,7 +188,7 @@ __device__ void NSMFFT_direct_2_2_params(double2 *A, double2 *fft_out,
  }

  Index twiddle_shift = 1;
-  for (Index l = LOG2_DEGREE - 1; l >= 5; --l) {
+  for (Index l = LOG2_DEGREE - 1; l > 5; --l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    twiddle_shift <<= 1;
@@ -221,8 +221,8 @@ __device__ void NSMFFT_direct_2_2_params(double2 *A, double2 *fft_out,
      tid = tid + STRIDE;
    }
  }
-
-  for (Index l = 4; l >= 1; --l) {
+  __syncthreads();
+  for (Index l = 5; l >= 1; --l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    twiddle_shift <<= 1;
@@ -425,7 +425,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
  }

  Index twiddle_shift = DEGREE;
-  for (Index l = 1; l <= 4; ++l) {
+  for (Index l = 1; l <= 5; ++l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    tid = threadIdx.x;
@@ -459,7 +459,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
    }
  }

-  for (Index l = 5; l <= LOG2_DEGREE - 1; ++l) {
+  for (Index l = 6; l <= LOG2_DEGREE - 1; ++l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    tid = threadIdx.x;
@@ -467,7 +467,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,

    // at this point registers are ready for the  butterfly
    tid = threadIdx.x;
-    __syncthreads();
+
 #pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
      w = (u[i] - v[i]);
@@ -495,6 +495,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,

      tid = tid + STRIDE;
    }
+    __syncthreads();
  }

 // last iteration
@@ -540,6 +541,44 @@ __global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
  }
 }

+/*
+ * global batch fft: each block runs NSMFFT_direct_2_2_params at half size
+ * (HalfDegree<params>) on d_input / d_output at offset blockIdx.x * degree / 2
+ * NOTE(review): said "unrolling gives half size + 1 elements" - confirm
+ * params must carry the actual (full) degree
+ * d_input must already be compressed; buffer and SMD are not used here
+ */
+template <class params, sharedMemDegree SMD>
+__global__ void batch_NSMFFT_classical_specialized(double2 *d_input,
+                                                   double2 *d_output,
+                                                   double2 *buffer) {
+  extern __shared__ double2 sharedMemoryFFT[];
+  // For specialized we will always have enough shared memory
+  double2 *fft = sharedMemoryFFT;
+  int tid = threadIdx.x;
+  // Twiddles are placed in shared memory right after degree / 2 FFT slots.
+  double2 *shared_twiddles = fft + params::degree / 2;
+  // Each thread keeps params::opt / 2 values of its polynomial in registers.
+  double2 fft_regs[params::opt / 2];
+#pragma unroll
+  for (int i = 0; i < params::opt / 2; i++) {
+    shared_twiddles[tid] = negtwiddles[tid];
+    fft_regs[i] = d_input[blockIdx.x * (params::degree / 2) + tid];
+    tid = tid + params::degree / params::opt;
+  }
+  __syncthreads();
+  // fft_regs is passed as the fft_out argument and is read back below.
+  NSMFFT_direct_2_2_params<HalfDegree<params>>(fft, fft_regs, shared_twiddles);
+  __syncthreads();
+  // Store the registers to d_output with the same striding as the load.
+  tid = threadIdx.x;
+#pragma unroll
+  for (int i = 0; i < params::opt / 2; i++) {
+    d_output[blockIdx.x * (params::degree / 2) + tid] = fft_regs[i];
+    tid = tid + params::degree / params::opt;
+  }
+}
+
 /*
 * global batch polynomial multiplication
 * only used for fft tests
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
@@ -68,9 +68,15 @@ struct alignas(16) f128 {
    auto t = two_sum(a.lo, b.lo);

    double hi = s.hi;
+#ifdef __CUDA_ARCH__
+    double lo = __dadd_rn(s.lo, t.hi);
+    hi = __dadd_rn(hi, lo);
+    lo = __dsub_rn(lo, __dsub_rn(hi, s.hi));
+#else
    double lo = s.lo + t.hi;
    hi = hi + lo;
    lo = lo - (hi - s.hi);
+#endif

    return f128(hi, lo + t.lo);
  }
@@ -104,8 +110,13 @@ struct alignas(16) f128 {
  __host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
    auto s = two_diff(a.hi, b.hi);
    auto t = two_diff(a.lo, b.lo);
+#ifdef __CUDA_ARCH__
+    s = quick_two_sum(s.hi, __dadd_rn(s.lo, t.hi));
+    return quick_two_sum(s.hi, __dadd_rn(s.lo, t.lo));
+#else
    s = quick_two_sum(s.hi, s.lo + t.hi);
    return quick_two_sum(s.hi, s.lo + t.lo);
+#endif
  }

  // Multiplication
@@ -220,16 +231,16 @@ struct f128x2 {
  // Subtraction
  __host__ __device__ friend f128x2 operator-(const f128x2 &a,
                                              const f128x2 &b) {
-    return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
-                  f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
+    return f128x2(f128::sub_estimate(a.re, b.re),
+                  f128::sub_estimate(a.im, b.im));
  }

  // Multiplication (complex multiplication)
  __host__ __device__ friend f128x2 operator*(const f128x2 &a,
                                              const f128x2 &b) {
+    const f128 a_im_b_im = f128::mul(a.im, b.im);
    f128 real_part =
-        f128::add(f128::mul(a.re, b.re),
-                  f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
+        f128::add(f128::mul(a.re, b.re), f128(-a_im_b_im.hi, -a_im_b_im.lo));
    f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
    return f128x2(real_part, imag_part);
  }
@@ -243,8 +254,8 @@ struct f128x2 {

  // Subtraction-assignment operator
  __host__ __device__ f128x2 &operator-=(const f128x2 &other) {
-    re = f128::add(re, f128(-other.re.hi, -other.re.lo));
-    im = f128::add(im, f128(-other.im.hi, -other.im.lo));
+    re = f128::sub_estimate(re, other.re);
+    im = f128::sub_estimate(im, other.im);
    return *this;
  }

@@ -261,12 +272,20 @@ struct f128x2 {
 };

 __host__ __device__ inline uint64_t double_to_bits(double d) {
+#ifdef __CUDA_ARCH__
+  uint64_t bits = __double_as_longlong(d);
+#else
  uint64_t bits = *reinterpret_cast<uint64_t *>(&d);
+#endif
  return bits;
 }

 __host__ __device__ inline double bits_to_double(uint64_t bits) {
+#ifdef __CUDA_ARCH__
+  double d = __longlong_as_double(bits);
+#else
  double d = *reinterpret_cast<double *>(&bits);
+#endif
  return d;
 }

@@ -275,6 +294,8 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
  const double A = ONE << 52;
  const double B = ONE << 104;
  const double C = ONE << 76;
+  // NOTE: D is 2^128 (bits 0x47f0000000000000); 0x37f0000000000000 is 2^-128,
+  // and __longlong_as_double is device-only, so keep the decimal literal
  const double D = 340282366920938500000000000000000000000.;

  const __uint128_t threshold = (ONE << 104);
@@ -288,15 +309,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = A_bits | lower64;
    double l_temp = bits_to_double(bits_l);
-    double l = l_temp - A;

    uint64_t B_bits = double_to_bits(B);
    uint64_t top64 = static_cast<uint64_t>(x >> 52);
    uint64_t bits_h = B_bits | top64;
    double h_temp = bits_to_double(bits_h);
+
+#ifdef __CUDA_ARCH__
+    return __dadd_rn(__dsub_rn(l_temp, A), __dsub_rn(h_temp, B));
+#else
+    double l = l_temp - A;
    double h = h_temp - B;

    return (l + h);
+#endif

  } else {
    uint64_t C_bits = double_to_bits(C);
@@ -310,15 +336,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = C_bits | lower64 | mask_part;
    double l_temp = bits_to_double(bits_l);
-    double l = l_temp - C;

    uint64_t D_bits = double_to_bits(D);
    uint64_t top64 = static_cast<uint64_t>(x >> 76);
    uint64_t bits_h = D_bits | top64;
    double h_temp = bits_to_double(bits_h);
+
+#ifdef __CUDA_ARCH__
+    return __dadd_rn(__dsub_rn(l_temp, C), __dsub_rn(h_temp, D));
+#else
+    double l = l_temp - C;
    double h = h_temp - D;

    return (l + h);
+#endif
  }
 }

@@ -389,6 +420,8 @@ __host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {

 __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  auto x = f128::sub_estimate(a, f128::f128_floor(a));
+  // NOTE: normalization is 2^128 (bits 0x47f0000000000000); the constant
+  // 0x37f0000000000000 is 2^-128 and __longlong_as_double is device-only
  const double normalization = 340282366920938500000000000000000000000.;
 #ifdef __CUDA_ARCH__
  x.hi = __dmul_rn(x.hi, normalization);
@@ -398,7 +431,7 @@ __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  x.lo *= normalization;
 #endif

-  // TODO has to be round
+  x = f128::add_estimate(x, f128(0.5, 0.0));
  x = f128::f128_floor(x);

  __uint128_t x0 = f64_to_u128(x.hi);
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -12,8 +12,9 @@
 using Index = unsigned;

 #define NEG_TWID(i)                                                            \
-  f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]),               \
-         f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
+  f128x2(                                                                      \
+      f128(__ldg(&neg_twiddles_re_hi[(i)]), __ldg(&neg_twiddles_re_lo[(i)])),  \
+      f128(__ldg(&neg_twiddles_im_hi[(i)]), __ldg(&neg_twiddles_im_lo[(i)])))

 #define F64x4_TO_F128x2(f128x2_reg, ind)                                       \
  f128x2_reg.re.hi = dt_re_hi[ind];                                            \
@@ -75,7 +76,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
-      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+      if (u_stays_in_register) {
+        F128x2_TO_F64x4(v[i], tid);
+      } else {
+        F128x2_TO_F64x4(u[i], tid);
+      }
      tid = tid + STRIDE;
    }
    __syncthreads();
@@ -86,8 +91,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);
-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
+      if (u_stays_in_register) {
+        v[i] = w;
+      } else {
+        u[i] = w;
+      }
      w = NEG_TWID(tid / lane_mask + twiddle_shift);
      f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
      f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
@@ -151,7 +159,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      // keep one of the register for next iteration and store another one in sm
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
-      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+      if (u_stays_in_register) {
+        F128x2_TO_F64x4(v[i], tid);
+      } else {
+        F128x2_TO_F64x4(u[i], tid);
+      }

      tid = tid + STRIDE;
    }
@@ -165,8 +177,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);

-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
+      if (u_stays_in_register) {
+        v[i] = w;
+      } else {
+        u[i] = w;
+      }

      tid = tid + STRIDE;
    }
@@ -218,7 +233,7 @@ __device__ void convert_u128_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re, const __uint128_t *in_im) {

-  const double normalization = pow(2., -128.);
+  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
@@ -241,7 +256,7 @@ __device__ void convert_u128_on_regs_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {

-  const double normalization = pow(2., -128.);
+  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -12,7 +12,7 @@
 #include "integer/subtraction.cuh"
 #include "pbs/programmable_bootstrap_classic.cuh"
 #include "pbs/programmable_bootstrap_multibit.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"

 // lwe_dimension + 1 threads
 // todo: This kernel MUST be refactored to a binary reduction
@@ -98,7 +98,7 @@ __host__ void are_all_comparisons_block_true(

  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    int num_chunks = CEIL_DIV(remaining_blocks, max_value);

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
@@ -134,28 +134,26 @@ __host__ void are_all_comparisons_block_true(
        auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
          return x == chunk_length;
        };
-        generate_device_accumulator_with_cpu_prealloc<Torus>(
-            streams.stream(0), streams.gpu_index(0),
-            is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
-            is_max_value_lut->get_max_degree(1), glwe_dimension,
-            polynomial_size, message_modulus, carry_modulus,
-            is_equal_to_num_blocks_lut_f, true,
-            are_all_block_true_buffer->preallocated_h_lut);

-        Torus *h_lut_indexes = is_max_value_lut->h_lut_indexes;
-        for (int index = 0; index < num_chunks; index++) {
-          if (index == num_chunks - 1) {
-            h_lut_indexes[index] = 1;
-          } else {
-            h_lut_indexes[index] = 0;
-          }
-        }
-        cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
-                                 h_lut_indexes, num_chunks * sizeof(Torus),
-                                 streams.stream(0), streams.gpu_index(0));
+        auto num_blocks = is_max_value_lut->num_blocks;
        auto active_streams =
            streams.active_gpu_subset(num_chunks, params.pbs_type);
-        is_max_value_lut->broadcast_lut(active_streams);
+
+        // Index generator: fills one LUT index per block of the LUT.
+        // The last chunk (num_chunks - 1) uses LUT 1; every other entry,
+        // including blocks at or beyond num_chunks, uses LUT 0.
+        auto index_gen = [num_chunks, num_blocks](Torus *h_lut_indexes,
+                                                  uint32_t) {
+          // Explicit form of the int -> unsigned conversion in the compare
+          const auto last_chunk = static_cast<uint32_t>(num_chunks - 1);
+          for (uint32_t index = 0; index < num_blocks; index++) {
+            h_lut_indexes[index] = (index == last_chunk) ? 1 : 0;
+          }
+        };
+
+        is_max_value_lut->generate_and_broadcast_lut(
+            active_streams, {1}, {is_equal_to_num_blocks_lut_f}, index_gen,
+            true, {are_all_block_true_buffer->preallocated_h_lut});
      }
      lut = is_max_value_lut;
    }
@@ -167,15 +165,10 @@ __host__ void are_all_comparisons_block_true(
          streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
      // Reset max_value_lut_indexes before returning, otherwise if the lut is
      // reused the lut indexes will be wrong
-      memset(is_max_value_lut->h_lut_indexes, 0,
-             is_max_value_lut->num_blocks * sizeof(Torus));
-      cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
-                               is_max_value_lut->h_lut_indexes,
-                               is_max_value_lut->num_blocks * sizeof(Torus),
-                               streams.stream(0), streams.gpu_index(0));
      auto active_gpu_count_is_max = streams.active_gpu_subset(
          is_max_value_lut->num_blocks, params.pbs_type);
-      is_max_value_lut->broadcast_lut(active_gpu_count_is_max, false);
+      is_max_value_lut->set_lut_indexes_and_broadcast_constant(
+          active_gpu_count_is_max, 0);

      reset_radix_ciphertext_blocks(lwe_array_out, 1);
      return;
@@ -222,7 +215,7 @@ __host__ void is_at_least_one_comparisons_block_true(
  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    int num_chunks = CEIL_DIV(remaining_blocks, max_value);

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
@@ -483,14 +476,11 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    y = x;
    f = sign_handler_f;
  }
-  generate_device_accumulator_with_cpu_prealloc<Torus>(
-      streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
-      last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
-      polynomial_size, message_modulus, carry_modulus, f, true,
-      tree_buffer->preallocated_h_lut);

  auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-  last_lut->broadcast_lut(active_streams);
+  last_lut->generate_and_broadcast_lut(active_streams, {0}, {f},
+                                       LUT_0_FOR_ALL_BLOCKS, true,
+                                       {tree_buffer->preallocated_h_lut});

  // Last leaf
  integer_radix_apply_univariate_lookup_table<Torus>(streams, lwe_array_out, y,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -10,7 +10,122 @@
 #include "integer/integer.cuh"
 #include "linearalgebra/multiplication.cuh"
 #include "polynomial/functions.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
+
+/*
+ * =============================================================================
+ * GPU Compression/Decompression Algorithm: Overview
+ * =============================================================================
+ *
+ * The compression algorithm transforms standard LWE ciphertexts into a compact
+ * packed format. Decompression reverses this process.
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION INPUT (lwe_array_in)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |                    lwe_array_in (GPU memory)                            |
+ *  +-------------------------------------------------------------------------+
+ *  +---------------------------+---------------------------+-----------------+
+ *  |          LWE 0            |          LWE 1            |      ...        |
+ *  |      [mask, body]         |      [mask, body]         |                 |
+ *  +---------------------------+---------------------------+-----------------+
+ *  |<-- lwe_dimension + 1 -->|
+ *
+ *  Total LWEs: total_lwe_bodies_count (num_radix_blocks)
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION PROCESS
+ * -----------------------------------------------------------------------------
+ *
+ * 1. Message Shift (64-bit only):
+ *    Each LWE is multiplied by message_modulus to shift the message to MSB
+ *
+ * 2. Packing Keyswitch (LWE -> GLWE):
+ *    Groups of up to lwe_per_glwe LWEs are packed into a single GLWE:
+ *
+ *    +--------------------------------------------------------------+
+ *    |   lwe_per_glwe LWEs (input batch)                            |
+ *    |   LWE[0], LWE[1], ..., LWE[lwe_per_glwe-1]                   |
+ *    +--------------------------------------------------------------+
+ *                              |
+ *                    Packing Keyswitch
+ *                              v
+ *    +--------------------------------------------------------------+
+ *    |            Single GLWE Ciphertext                            |
+ *    |   [A_0, A_1, ..., A_{k-1}, B]                                |
+ *    |   |<-- k * polynomial_size -->| |<-- polynomial_size -->|   |
+ *    +--------------------------------------------------------------+
+ *
+ *    Number of output GLWEs: num_glwes = ceil(total_lwe_bodies_count /
+ *                                             lwe_per_glwe)
+ *
+ * 3. Modulus Switch:
+ *    Reduce precision from 64-bit torus to storage_log_modulus bits
+ *
+ * 4. Bit Packing:
+ *    Pack multiple reduced-precision elements into dense bit representation
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION MEMORY LAYOUT (tmp_glwe_array_out)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |                 tmp_glwe_array_out (intermediate buffer)                |
+ *  +-------------------------------------------------------------------------+
+ *  +----------------------------+----------------------------+---------------+
+ *  |         GLWE 0             |         GLWE 1             |    ...        |
+ *  |[A_0..A_{k-1}, B_0..B_{N-1}]|[A_0..A_{k-1}, B_0..B_{N-1}]|               |
+ *  +----------------------------+----------------------------+---------------+
+ *       |<-- glwe_accumulator_size = (k+1)*N -->|
+ *
+ *  Total size needed: num_glwes * glwe_accumulator_size elements
+ *  Where: num_glwes = ceil(total_lwe_bodies_count / lwe_per_glwe)
+ *
+ * -----------------------------------------------------------------------------
+ * PACKED OUTPUT (glwe_array_out)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |              Packed GLWE Ciphertext List (bit-packed)                   |
+ *  +-------------------------------------------------------------------------+
+ *  +-------------------------------------------------------------------------+
+ *  |  Elements packed with storage_log_modulus bits per original element    |
+ *  |  Total packed size: ceil(in_len * storage_log_modulus / 64) elements   |
+ *  +-------------------------------------------------------------------------+
+ *
+ * =============================================================================
+ * DECOMPRESSION (Extract) Algorithm
+ * =============================================================================
+ *
+ * Decompression receives an array of LWE indexes. For each index, it identifies
+ * the corresponding GLWE, extracts that GLWE from the packed representation,
+ * and then sample-extracts the requested LWE from the GLWE.
+ *
+ * -----------------------------------------------------------------------------
+ * EXTRACT OUTPUT LAYOUT (glwe_array_out in host_extract)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |               Extracted GLWE Ciphertext                                 |
+ *  +-------------------------------------------------------------------------+
+ *  +---------------------------------------+-----------------+---------------+
+ *  |    Mask (A polynomials)               |   Body (B)      |    Tail       |
+ *  |    [A_0, ..., A_{k-1}]                |   (body_count)  |   (zeroed)    |
+ *  |    k * polynomial_size elements       |   elements      |   elements    |
+ *  +---------------------------------------+-----------------+---------------+
+ *  |<------------------- initial_out_len ------------------->|               |
+ *  |<------------------------ glwe_ciphertext_size ------------------------->|
+ *
+ *  For the last GLWE, body_count may be less than polynomial_size (partial).
+ *  The tail region must be zeroed to ensure defined behavior.
+ *
+ *  tail_size = glwe_ciphertext_size - initial_out_len
+ *  tail_offset = initial_out_len  (NOT 0!)
+ *
+ * =============================================================================
+ */

 template <typename Torus>
 __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
@@ -66,7 +181,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,

  // number_bits_to_pack.div_ceil(Scalar::BITS)
  auto nbits = sizeof(Torus) * 8;
-  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
+  auto out_len = CEIL_DIV(number_bits_to_pack, nbits);

  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(out_len, 1024, num_blocks, num_threads);
@@ -108,6 +223,8 @@ host_integer_compress(CudaStreams streams,
  uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
                        glwe_array_out->lwe_per_glwe - 1) /
                       glwe_array_out->lwe_per_glwe;
+  PANIC_IF_FALSE(num_glwes <= mem_ptr->max_num_glwes,
+                 "Invalid number of GLWEs");

  // Keyswitch LWEs to GLWE
  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -200,8 +317,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,

  auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;

-  uint32_t num_glwes =
-      (total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
+  uint32_t num_glwes = CEIL_DIV(total_lwe_bodies_count, polynomial_size);

  // Compressed length of the compressed GLWE we want to extract
  uint32_t body_count = 0;
@@ -218,19 +334,21 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,

  uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;

-  // Calculates how many bits this particular GLWE shall use
-  auto number_bits_to_unpack = initial_out_len * log_modulus;
  auto nbits = sizeof(Torus) * 8;

-  // Calculates how many bits a full-packed GLWE shall use
-  number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
-  auto len = (number_bits_to_unpack + nbits - 1) / nbits;
+  // Calculate how many bits a full-packed GLWE uses, to determine
+  // the stride between consecutive packed GLWEs in the input buffer
+  auto number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
+  auto len = CEIL_DIV(number_bits_to_unpack, nbits);
  // Uses that length to set the input pointer
  auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;

  // Ensure the tail of the GLWE is zeroed
+  // The extract kernel writes initial_out_len elements starting at offset 0.
+  // We must zero the tail region (from initial_out_len to
+  // glwe_ciphertext_size)
  if (initial_out_len < glwe_ciphertext_size) {
-    cuda_memset_async(glwe_array_out, 0,
+    cuda_memset_async(glwe_array_out + initial_out_len, 0,
                      (glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
                      stream, gpu_index);
  }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -14,7 +14,6 @@
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
 #include "utils/helper_profile.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <algorithm>
 #include <functional>

@@ -273,8 +272,7 @@ __global__ void device_radix_split_simulators_and_grouping_pgns(
      }
    }

-    if ((blockIdx.x / group_size + 1) <
-        (blocks_count + group_size - 1) / group_size) {
+    if ((blockIdx.x / group_size + 1) < CEIL_DIV(blocks_count, group_size)) {
      size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size;
      size_t pgns_offset = (blockIdx.x / group_size) * lwe_size;
      for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
@@ -363,7 +361,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
      num_radix_blocks > src1->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks should have more "
          "blocks than the number used in sum in groups")
-  auto num_groups = (num_radix_blocks + group_size - 1) / group_size;
+  auto num_groups = CEIL_DIV(num_radix_blocks, group_size);
  if (src2->num_radix_blocks < num_groups)
    PANIC("Cuda error: second input in sum in groups should have at least "
          "num_groups blocks")
@@ -544,6 +542,24 @@ __host__ void integer_radix_apply_univariate_lookup_table(

  auto active_streams =
      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+
+  // Verify consistency between set_lut_indexes and apply_lookup_table
+  GPU_ASSERT(
+      num_radix_blocks <= lut->last_broadcast_num_radix_blocks,
+      "num_radix_blocks (%u) must be <= last_broadcast_num_radix_blocks (%u)",
+      num_radix_blocks, lut->last_broadcast_num_radix_blocks);
+  GPU_ASSERT(active_streams.count() <= lut->last_broadcast_streams.count(),
+             "active_streams count (%u) must be <= last_broadcast_streams "
+             "count (%u)",
+             active_streams.count(), lut->last_broadcast_streams.count());
+  for (uint32_t i = 0; i < active_streams.count(); i++) {
+    GPU_ASSERT(active_streams.gpu_index(i) ==
+                   lut->last_broadcast_streams.gpu_index(i),
+               "active_streams gpu_index(%u) = %u must match "
+               "last_broadcast_streams gpu_index(%u) = %u",
+               i, active_streams.gpu_index(i), i,
+               lut->last_broadcast_streams.gpu_index(i));
+  }
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
@@ -962,8 +978,9 @@ uint64_t generate_many_lookup_table(
 template <typename Torus>
 void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,
                                       uint32_t polynomial_size,
-                                       std::function<Torus(uint32_t)> f) {
+                                       std::function<Torus(Torus)> f) {

+  // accumulator number of elements is (glwe_dimension + 1) * polynomial_size
  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];
@@ -975,9 +992,9 @@ void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,

 template <typename Torus>
 void generate_device_accumulator_no_encoding(
-    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t &degree,
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
-    uint32_t polynomial_size, std::function<Torus(uint32_t)> f,
+    uint32_t polynomial_size, std::function<Torus(Torus)> f,
    bool gpu_memory_allocated) {

  Torus *h_lut =
@@ -986,7 +1003,7 @@ void generate_device_accumulator_no_encoding(
  generate_lookup_table_no_encoding<Torus>(h_lut, glwe_dimension,
                                           polynomial_size, f);

-  degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;
+  *degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;

  cuda_memcpy_with_size_tracking_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
@@ -1738,12 +1755,9 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
      signs_array_in, 0, num_sign_blocks);
  if (num_sign_blocks > 2) {
    auto lut = diff_buffer->reduce_signs_lut;
-    generate_device_accumulator_with_cpu_prealloc<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
-        reduce_two_orderings_function, true, diff_buffer->preallocated_h_lut1);
-    lut->broadcast_lut(lut->active_streams);
+    lut->generate_and_broadcast_lut(
+        lut->active_streams, {0}, {reduce_two_orderings_function},
+        LUT_0_FOR_ALL_BLOCKS, true, {diff_buffer->preallocated_h_lut1});

    while (num_sign_blocks > 2) {
      pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
@@ -1769,12 +1783,10 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
    };

    auto lut = diff_buffer->reduce_signs_lut;
-    generate_device_accumulator_with_cpu_prealloc<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
-        diff_buffer->preallocated_h_lut2);
-    lut->broadcast_lut(lut->active_streams);
+
+    lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
+                                    LUT_0_FOR_ALL_BLOCKS, true,
+                                    {diff_buffer->preallocated_h_lut2});

    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
                       signs_a, num_sign_blocks, message_modulus);
@@ -1789,12 +1801,9 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
    };

    auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
-    generate_device_accumulator_with_cpu_prealloc<Torus>(
-        streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
-        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
-        diff_buffer->preallocated_h_lut2);
-    lut->broadcast_lut(lut->active_streams);
+    lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
+                                    LUT_0_FOR_ALL_BLOCKS, true,
+                                    {diff_buffer->preallocated_h_lut2});

    integer_radix_apply_univariate_lookup_table<Torus>(
        streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
@@ -2344,7 +2353,7 @@ integer_radix_apply_noise_squashing(CudaStreams streams,

  // Since the radix ciphertexts are packed, we have to use the num_radix_blocks
  // from the output ct
-  auto active_streams = streams.active_gpu_subset(
+  auto active_streams = streams.active_gpu_subset_u128(
      lwe_array_out->num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<InputTorus>(
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Agnes Leroy	d0fdaeea75	Fix Makefile	2026-02-13 09:49:21 +01:00
Agnes Leroy	077c063c20	Add cost for 8xL40	2026-02-13 09:01:10 +01:00
Agnes Leroy	565b903534	Add decomp_sns_comp bench to summary	2026-02-12 17:44:24 +01:00
Agnes Leroy	0aaae99b56	Add 8xL40 to slab config	2026-02-12 16:34:03 +01:00
Agnes Leroy	7bdd0adeff	Fix Makefile	2026-02-12 15:05:00 +01:00
Guillermo Oyarzun	d8d155e484	feat(gpu): implement specialized pbs for any architecture	2026-02-12 11:46:14 +01:00
Agnes Leroy	3169ee8093	chore(gpu): bench classic or multi-bit params for compress/decompress	2026-02-12 11:45:44 +01:00
David Testé	8119c2287b	WIP: run classic and multi_bit benches on gpu	2026-02-12 11:42:48 +01:00
David Testé	e4c7f83e17	WIP: use placeholder workflow to run benches	2026-02-10 16:09:26 +01:00
David Testé	349846bc11	WIP: add profile selection for GPU bench	2026-02-10 16:09:25 +01:00
David Testé	b1fb4b2ae0	WIP: adding missing DEX bench	2026-02-10 16:09:25 +01:00
David Testé	da3c55c50b	WIP: implement bench workflow testing cpu and gpu	2026-02-10 16:09:22 +01:00
Agnes Leroy	7593093d8f	chore(gpu): add classic params to zk and sns benches	2026-02-10 16:07:00 +01:00
David Testé	344ea55426	chore(bench): allow result parsing of kv store benchmarks Naming has been reworked to stick with the future naming standard. Call to write_to_json() has been added to be able to parse the results and send them to database.	2026-02-10 16:02:27 +01:00
David Testé	53de52c9fd	chore(bench): move kv store benchmarks to their own file	2026-02-10 16:02:27 +01:00
Arthur Meyre	14870536e4	chore: update webpack to 5.105.0 which is signed and has provenance	2026-02-10 14:38:22 +01:00
Arthur Meyre	35cc35bd96	chore: revert backward compatibility change - this change should not have been needed and poses risks for backward compatibility - HL CUDA: use dedicated type for the ReRand	2026-02-10 14:25:25 +01:00
Guillermo Oyarzun	20403d6325	fix(gpu): avoid unspecified behavior during the reduction	2026-02-10 11:33:40 +01:00
Andrei Stoian	71f52b2853	fix(gpu): more refactor	2026-02-09 22:32:57 +01:00
Andrei Stoian	d83e57f29a	fix(gpu): encapsulate lut generation	2026-02-09 22:32:57 +01:00
Agnes Leroy	9592445bd8	fix(gpu): revert changes in compression This reverts commit `91a927e09e` and `7fac0bf3b2`.	2026-02-09 17:04:17 +01:00
David Testé	035285fcbe	chore(ci): rollback slsa-github-generator action version pinning According to the action documentation, pinning to a commit cannot be done yet.	2026-02-09 10:20:56 +01:00
David Testé	650f8a400f	chore(ci): add runs-on configuration file This is done before migrating the CI running on AWS to avoid inline runs-on configuration in each workflow file.	2026-02-09 09:35:29 +01:00
Pedro Alves	7fac0bf3b2	fix(gpu): use correct lwe_per_glwe value in compression metadata	2026-02-08 12:14:07 -03:00
Pedro Alves	91a927e09e	fix(gpu): add lwe_per_glwe consistency checks between scratch and host functions	2026-02-08 12:14:07 -03:00
Pedro Alves	87cf5dd8a0	fix(gpu): remove unused number_bits_to_unpack initialization in host_extract	2026-02-08 12:14:07 -03:00
Pedro Alves	52d90e3c62	fix(gpu): assert all items have same shape in from_vec_cuda_lwe_ciphertexts_list	2026-02-08 12:14:07 -03:00
Pedro Alves	4abaf92dbd	fix(gpu): assert exactly one element in CudaLweCiphertextList::into_lwe_ciphertext	2026-02-08 12:14:07 -03:00
Pedro Alves	140d27f11d	fix(gpu): use lwe_compact_ciphertext_list_size for validation in CudaLweCompactCiphertextList::from_d_vec	2026-02-08 12:14:07 -03:00
Agnes Leroy	cd4f677248	chore(gpu): improve checks in device code	2026-02-06 18:31:29 +01:00
Agnes Leroy	6ad8f30e3f	fix(gpu): fix logic in lwe ciphertext list	2026-02-06 18:18:14 +01:00
David Testé	58f075b669	chore(ci): run workflows on push only on public repository	2026-02-06 18:00:55 +01:00
David Testé	4145497a47	chore(ci): fix parameters check workflow setup condition Prior to this, parameters_check workflow could never run on pull-request event even if one of the paths was matched.	2026-02-06 18:00:55 +01:00
Pedro Alves	c939687351	chore(gpu): replaces (a + b - 1) / b patterns in the cuda backend by CEIL_DIV - also, this commit renames kernel_dimensions.cuh to helper.cuh and copies the content of the older helper.cuh into helper_debug.cuh	2026-02-06 16:48:25 +01:00
Agnes Leroy	316c345d0a	chore(gpu): add some missing checks in core crypto	2026-02-06 15:49:17 +01:00
Agnes Leroy	dcb0f892ef	chore(gpu): cleanup device.cu binding, remove _async functions from core crypto	2026-02-06 15:49:17 +01:00
Thomas Montaigu	4e1ab7f769	chore: move shortint expanded types into shortint mod The expanded types definitions were in the high level API as it was originally related to the XofKeySet feature. However, since it's now used even in non-xof setting we decided to move these types to shortint module where they conceptually belong	2026-02-06 15:49:01 +01:00
Thomas Montaigu	b8843352a5	feat(hlapi): add is_conformant for CompressedXofKeySet	2026-02-06 15:48:42 +01:00
Thomas Montaigu	8f9571dc64	fix(xofkeyset): generate multibit decompression key when params are multibit	2026-02-06 15:48:42 +01:00
Thomas Montaigu	1d7c7dfa98	fix(conformance): GGSW list had wrong group count	2026-02-06 15:48:42 +01:00
Thomas Montaigu	b3029d7296	chore(xof_key_set): make generate_with_pre_seeded_generator public MPC teams needs to be able to generate a CompressedXofKeySet from an existing ClientKey	2026-02-06 13:29:03 +01:00
Agnes Leroy	b4c8f782c4	chore(gpu): add fallback for 4-l40 in CI	2026-02-06 12:00:25 +01:00
David Testé	94fb1c61fe	doc: fix specs description of aws hpc7a.96xlarge instance These instances have two sockets, each equipped with a 96-core CPU.	2026-02-06 10:50:20 +01:00
Guillermo Oyarzun	9a870652dd	fix(gpu): clean unused variables in specialized classical pbs	2026-02-06 09:59:07 +01:00
David Testé	b71799de2f	chore(ci): add svg generation for erc20 benchmarks This commit introduces the concept of a benchmark subset in the data_extractor. This allows a user to fetch only part of the benchmark results on a given layer. For now only HLAPI ERC20 benchmarks handling is implemented. Also, the benchmark type 'both' has been added. It allows a user to fetch both latency and throughput results in the database. This is used in ERC20 SVG generation to display these two benchmark types within the same table.	2026-02-05 18:24:12 +01:00
Agnes Leroy	869af08f1e	chore(gpu): make valgrind run weekly	2026-02-05 17:46:10 +01:00
Agnes Leroy	4a2eac2990	chore(gpu): extend time for 4090 tests now that tests take longer	2026-02-05 16:51:52 +01:00
David Testé	bbe62324fa	chore(ci): pin slsa-github-generator to a specific commit Zizmor analysis didn't pick up this non-pinned action usage. It's now pinned to the commit of the v2.1.0 of the action.	2026-02-05 16:11:14 +01:00
Andrei Stoian	bdc5d8597e	fix(gpu): valgrind improve error return	2026-02-05 13:55:45 +01:00
Andrei Stoian	cba7f17c12	fix(gpu): deliberate leak	2026-02-05 13:55:45 +01:00
Andrei Stoian	74bbdf9038	fix(gpu): valgrind error on leaks	2026-02-05 13:55:45 +01:00
Guillermo Oyarzun	7da02520dd	feat(gpu): create different threshold for multi-gpu pbs128	2026-02-05 13:18:30 +01:00
Theo Souchon	0398dccf29	chore(bench): add dedup operations to reduce ci time	2026-02-05 13:07:56 +01:00
Theo Souchon	e3ad38d077	chore(bench): add throughput mode on hlapi operations	2026-02-05 13:07:56 +01:00
Guillermo Oyarzun	c471c3f687	fix(gpu): fix race condition in tbc implementations	2026-02-05 12:44:50 +01:00
David Testé	e40e127393	chore(bench): add fast benchmark capability to integer zk Now, one can run a fast ZK benchmark. This would run only the compute load 'verify' with 64bit and 256bit to prove using a 2048bit CRS. Fast benchmark can be triggered by using the make environment variable 'BENCH_OP_FLAVOR' with the value 'fast_default' or 'fast'.	2026-02-05 11:52:32 +01:00
Agnes Leroy	9ad43d62c3	chore(gpu): split fast h100 workflow into core and hlapi	2026-02-05 11:46:41 +01:00
Guillermo Oyarzun	8daccd1d85	feat(gpu): avoid register spilling memory in ff128	2026-02-04 18:08:42 +01:00
Guillermo Oyarzun	ed117630e8	feat(gpu): use 512 threads for pbs128 flavors	2026-02-04 18:08:42 +01:00
Enzo Di Maria	267ad280f2	feat(gpu): kreyvium	2026-02-04 15:14:55 +01:00
Agnes Leroy	9a556bc517	chore(gpu): reduce hl test time in CI	2026-02-04 13:54:37 +01:00
Agnes Leroy	5d651c0639	fix(gpu): fix small cpu memory leak	2026-02-04 11:42:00 +01:00
Thomas Montaigu	96b7c6ee60	refactor(csprng): move Seed backward compat in csprng XofSeed had its backward_compatibility in the csprng crate where it originates from. The Seed type did not, and core_crypto had to do extra work to have this backward compatibility. So we move the backward compatibility into csprng for consistency. Also, this removes a wrong re-export of serde as there was ``` // generator.rs pub mod serialization_proxy { pub(crate) use serde::{Deserialize, Serialize}; // ... } // random/mod.rs pub use generator::*; ``` which lead to Deserialize, Serialize being re-export crate wise and some other files of the lib imported these traits from here instead of serde	2026-02-04 09:37:59 +01:00
Thomas Montaigu	0340869347	feat(hlapi): add decompress_to_gpu for CompressedXofKeySet Done using the newly added expand + convert pattern	2026-02-04 09:36:32 +01:00
Thomas Montaigu	daff533de4	refactor(hlapi): add IntegerExpandedServerKey::convert_to_gpu And use it to convert from CompressedServerKey to CudaServerKey.	2026-02-04 09:36:32 +01:00
Thomas Montaigu	155c575bd9	refactor(hlapi): split gpu key conversion in expand/convert converting from CompressedServerKey (Cpu) to CudaServerKey was done via decompress_from_cpu/decompress_to_cuda methods. We refactor to split these functions in 2: one that converts from cpu (input in std domain for bootstrap keys), the other that decompress/expand then calls convert	2026-02-04 09:36:32 +01:00
Thomas Montaigu	e942c22bc1	refactor(hlapi): add expand() method to CompressedServerKey Add an expand method to CompressedServerKey that returns an IntegerExpandedServerKey. Refactor decompress() to use the new expand() then convert_to_cpu(). This will allow later to refactor the conversion from CompressedServerKey to CudaServerKey to follow a similar pattern, meaning we will be able to share the code that converts keys from CPU to GPU between the normal server key and the xof server key	2026-02-04 09:36:32 +01:00
Thomas Montaigu	e620768e3c	fix(hlapi-gpu): correct state of post decompression noise squashed ct The GPU part did not set the correct state This problem is sort of rare to encounter as generally a CompressedSquashedNoiseCiphertextList is deserialized then used. When deserialized it's on the CPU, so calls to `get` use CPU code which correctly sets the state. This problem is thus visible when either: - safe_deserializing and manually moving the list to GPU - deserialize - directly expanding after the creation of the list	2026-02-04 09:36:32 +01:00
Thomas Montaigu	ebd6c06d7b	fix(hlapi): return error when trying to expand non packed list	2026-02-04 09:36:32 +01:00
David Testé	fb630d5cd2	chore(bench): use tfhe-benchmark id format pattern in wasm This brings consistency across benchmark ID naming and eases work for data consumers like data_extractor.	2026-02-03 14:24:11 +01:00
Agnes Leroy	c2bcb10702	fix(gpu): fix noise level in match value	2026-02-03 09:05:06 +01:00
David Testé	a1f17f4a00	doc: add svg tables to zero-knowledge benchmarks This removes the embedded GSheet. These SVGs display more operations and inputs that reflect real-use cases. Throughput is available only for server-side computation as it's meaningless to perform multiple proofs in parallel on the client-side.	2026-02-02 15:59:33 +01:00
Andrei Stoian	8f35a3a3d6	chore(gpu): refactor lut generation	2026-02-02 15:01:41 +01:00
Beka Barbakadze	2f111bc413	feat(gpu): add bit-wise consistency test for forward FFT128	2026-02-02 17:06:14 +04:00
dependabot[bot]	ee9a95f8dd	chore(deps): bump actions/setup-python from 6.1.0 to 6.2.0 Bumps [actions/setup-python](https://github.com/actions/setup-python) from 6.1.0 to 6.2.0. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](`83679a892e...a309ff8b42`) --- updated-dependencies: - dependency-name: actions/setup-python dependency-version: 6.2.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2026-02-02 11:41:04 +01:00
dependabot[bot]	16bdbeb0ea	chore(deps): bump zizmorcore/zizmor-action from 0.3.0 to 0.4.1 Bumps [zizmorcore/zizmor-action](https://github.com/zizmorcore/zizmor-action) from 0.3.0 to 0.4.1. - [Release notes](https://github.com/zizmorcore/zizmor-action/releases) - [Commits](`e639db9933...135698455d`) --- updated-dependencies: - dependency-name: zizmorcore/zizmor-action dependency-version: 0.4.1 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2026-02-02 11:40:54 +01:00
dependabot[bot]	75667b79ca	chore(deps): bump JS-DevTools/npm-publish from 4.1.3 to 4.1.4 Bumps [JS-DevTools/npm-publish](https://github.com/js-devtools/npm-publish) from 4.1.3 to 4.1.4. - [Release notes](https://github.com/js-devtools/npm-publish/releases) - [Changelog](https://github.com/JS-DevTools/npm-publish/blob/main/CHANGELOG.md) - [Commits](`d2fef917d9...4ce4bd0f33`) --- updated-dependencies: - dependency-name: JS-DevTools/npm-publish dependency-version: 4.1.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-02-02 11:40:40 +01:00
dependabot[bot]	8b7f8ead23	chore(deps): bump foundry-rs/foundry-toolchain from 1.6.0 to 1.7.0 Bumps [foundry-rs/foundry-toolchain](https://github.com/foundry-rs/foundry-toolchain) from 1.6.0 to 1.7.0. - [Release notes](https://github.com/foundry-rs/foundry-toolchain/releases) - [Changelog](https://github.com/foundry-rs/foundry-toolchain/blob/master/RELEASE.md) - [Commits](`8b0419c685...8789b3e21e`) --- updated-dependencies: - dependency-name: foundry-rs/foundry-toolchain dependency-version: 1.7.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2026-02-02 11:40:25 +01:00
David Testé	03aa3ddec2	chore(ci): remove pull-request opening option in workflow In case of documentation benchmarks results, we want the upload to be manual.	2026-02-02 11:38:42 +01:00
Mayeul@Zama	40a439620f	chore(core): remove PlanWrapper	2026-02-02 09:42:13 +01:00
Mayeul@Zama	0fca1796f3	feat(core): add GenericPlanMap::set	2026-02-02 09:42:13 +01:00
Mayeul@Zama	3fb5eb7a21	refactor(core): use typed PolynomialSize for Plan	2026-02-02 09:42:13 +01:00
Mayeul@Zama	d1f77b23f3	refactor(core): simplify PlanMap	2026-02-02 09:42:13 +01:00
Mayeul@Zama	e75bca1b1d	refactor(core): factorize plan map management	2026-02-02 09:42:13 +01:00
Guillermo Oyarzun	aa1e9ffdfa	chore(gpu): avoid running gpu noise in integer tests	2026-02-02 09:24:04 +01:00
Theo Souchon	45d76cf079	chore(bench): remove bitnot from hlapi bench because already tested	2026-01-30 15:36:08 +01:00
Beka Barbakadze	f4713ceeaa	fix(gpu): add __syncthread and threadIdx condition for sample_extract_body in all pbs versions	2026-01-30 15:31:53 +01:00
Pedro Alves	600532e8f7	chore(gpu): add test for single-item compact ciphertext list expand	2026-01-30 09:37:58 +01:00
Andrei Stoian	7bfb236543	fix(gpu): more crypto param checks in cuda backend	2026-01-30 09:32:21 +01:00
Beka Barbakadze	7c0ecf48f2	refactor(gpu): refactor f128 and fft128 to prevent possible precision losses and improve performance	2026-01-30 08:55:22 +01:00
Pedro Alves	66b357d869	fix(gpu): fix some inconsistencies in decompression that could enable access of not allocated memory - also adds a diagram explaining how compression / decompression work	2026-01-29 16:35:03 -03:00
Agnes Leroy	b7786afaf5	fix(gpu): add check on nullptr for dest_indexes in many lut gather	2026-01-29 10:08:06 +01:00
Guillermo Oyarzun	84931c420f	fix(gpu): handling temporary events destruction	2026-01-28 15:46:08 +01:00
Pedro Alves	93201d6afb	fix(gpu): fix an invalid access in expand when the number of LWEs is odd - also improves test_expander_length_matches_data_items - adds diagrams and explanations about GPU's expand	2026-01-28 14:32:56 +01:00
Arthur Meyre	1f6681ec39	chore: remove milestone reminder on PRs	2026-01-28 13:03:53 +01:00
Theo Souchon	31a1e977b2	fix(wasm): benchmark compilation for wasm	2026-01-27 15:45:38 +01:00
David Testé	2cfa0f74d1	chore(bench): use standard separator for zk_pke benchmark ids Parameters and benchmark case configuration must be separated with a '::' instead of an underscore to ease parsing by automated tools like data_extractor.	2026-01-27 15:07:06 +01:00
David Testé	17fd3e9db5	chore(bench): add layer name to zk_pke benchmark ids Layer name 'integer' was missing in benchmark IDs. This made it difficult for automated tools like data_extractor to handle ZK benchmark results.	2026-01-27 15:07:06 +01:00
David Testé	59646a76e4	chore(ci): ensure no credentials are leaked in action log When using direct git command, credentials are exposed in the console logs. Despite the fact GitHub is redacting its secrets, adding --quiet flag ensures that, even if this redaction feature is flawed, we don't leak secrets in the action log. To go further, we also shred the local git configuration file to remove any trace of the remote branch that contains credentials.	2026-01-27 15:06:48 +01:00
Theo Souchon	406055671b	chore(bench): add missing operation in hlapi benches	2026-01-27 11:23:13 +01:00
David Testé	87bb4d99d3	chore(ci): update slab-github-runner action to v1.5.0 This new version improves handling of asynchronous tasks related to Slab and GitHub API.	2026-01-27 10:36:04 +01:00
Guillermo Oyarzun	9f9b54dcb8	fix(gpu): add panic for 32-bit Torus calls	2026-01-27 09:42:11 +01:00
Agnes Leroy	a8a796de6c	chore(gpu): fix logic to check ptr validity in device.cu	2026-01-27 09:19:48 +01:00
Agnes Leroy	7b4093b572	chore(gpu): stop trying to enable NVlink since we don't use it	2026-01-27 09:19:40 +01:00
Thomas Montaigu	f52eb16581	refactor(xof_key_set): split decompression into expansion and conversion Introduce IntegerExpandedServerKey as an intermediate representation between compressed (seeded) keys and backend-specific formats. Decompression is now a two-step process: 1. Seed expansion: decompress seeded keys into standard domain representations (e.g., LweBootstrapKey instead of FourierLweBootstrapKey) 2. Backend conversion: convert to target backend format (CPU Fourier, GPU, etc.) This separation allows sharing the expansion step across backends while specializing only the final conversion, as for this XOF based expansion the order is important Changes: - Split xof_key_set.rs into module structure (mod.rs, internal.rs, test.rs) - Add intermediate types, that contains the expanded, but not converted data	2026-01-26 18:53:53 +01:00
dependabot[bot]	96622506c5	chore(deps): bump actions/checkout from 6.0.1 to 6.0.2 Bumps [actions/checkout](https://github.com/actions/checkout) from 6.0.1 to 6.0.2. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](`8e8c483db8...de0fac2e45`) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: 6.0.2 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2026-01-26 17:49:17 +01:00
David Testé	ce73b934b2	chore(bench): add params type selection from env variable on gpu Prior to this, multi-bit integer benchmarks on GPU could only be launched from the make recipe 'bench_[signed_]integer_multi_bit'. Adding the parameters selection to 'bench_[signed_]integer_gpu' allows benchmark workflows to work as they are designed.	2026-01-26 17:08:30 +01:00