chore(ci): update slab-github-runner action to v1.5.0

This new version improves the handling of asynchronous tasks related to the Slab and GitHub APIs.
chore(gpu): add fallback for 4-l40 in CI
2026-04-28 03:01:21 -04:00 · 2026-02-11 13:59:49 +01:00 · 2026-02-11 13:59:49 +01:00 · 2026-02-11 13:59:49 +01:00 · 2026-02-11 13:59:49 +01:00 · 2026-02-11 13:59:49 +01:00
268 changed files with 7247 additions and 8342 deletions
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -41,7 +41,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -141,7 +141,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -146,7 +146,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -299,7 +299,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -86,7 +86,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -100,7 +100,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -87,7 +87,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -172,7 +172,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -155,7 +155,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -279,7 +279,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -39,7 +39,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -147,7 +147,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -134,7 +134,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -92,7 +92,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_hpu_common.yml
+++ b/.github/workflows/benchmark_hpu_common.yml
@@ -187,7 +187,7 @@ jobs:
      - name: Upload parsed results artifact
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
        with:
-          name: ${{ github.sha }}_${{ matrix.bench_type }}_${{ matrix.command }}_benchmarks
+          name: ${{ github.sha }}_${{ matrix.bench_type }}_integer_benchmarks
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -383,7 +383,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -71,7 +71,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -213,7 +213,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -242,7 +242,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -60,7 +60,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -31,7 +31,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -37,7 +37,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -43,7 +43,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -198,7 +198,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -79,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -187,7 +187,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -38,7 +38,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -159,7 +159,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -62,7 +62,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -114,7 +114,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -34,7 +34,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -83,7 +83,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -221,7 +221,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/pr_milestone_check.yml
+++ b/.github/workflows/pr_milestone_check.yml
@@ -1,67 +0,0 @@
-name: pr_milestone_check
-
-on:
-  pull_request:
-    types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
-
-permissions: {}
-
-# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
-# external contributors workflows are manually approved
-
-jobs:
-  check-empty-milestone:
-    name: pr_milestone_check/check-empty-milestone
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone == null
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone Missing
-
-            Please assign a milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is missing. This check is failing."
-          exit 1
-
-  check-milestone-open:
-    name: pr_milestone_check/check-milestone-open
-    runs-on: ubuntu-latest
-    if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
-    permissions:
-      pull-requests: write # Need write access on pull requests to post comment
-
-    steps:
-      - name: Post Reminder Comment
-        uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
-        with:
-          route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
-          body: |
-            '### ❌ Milestone is closed
-
-            Please assign an open milestone to this pull request. If your PR targets the next version of
-            TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
-
-            If your PR targets a patch version for previous releases: consider creating a dedicated
-            milestone e.g. v1.5.1 if it does not exist yet.'
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Check Final Status
-        run: |
-          echo "::error::Milestone is closed. This check is failing."
-          exit 1
--- a/7
+++ b/7
@@ -1454,13 +1454,6 @@ bench_integer_aes256_gpu: install_rs_check_toolchain
 	--bench integer-aes256 \
 	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --

-.PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
-bench_integer_trivium_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-trivium \
-	--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
-
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
 ```

 > [!Note]
-> Note: You need Rust version 1.91.1 or newer to compile TFHE-rs. You can check your version with `rustc --version`.
+> Note: You need Rust version 1.84 or newer to compile TFHE-rs. You can check your version with `rustc --version`.

 > [!Note]
 > Note: AArch64-based machines are not supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
--- a/apps/test-vectors/data/README.md
+++ b/apps/test-vectors/data/README.md
@@ -1,43 +1,43 @@
 # Test vectors for TFHE
 These test vectors are generated using [TFHE-rs](https://github.com/zama-ai/tfhe-rs), with the git tag `tfhe-test-vectors-0.2.0`.

-They are TFHE-rs objects serialized in the [cbor format](https://cbor.io/). These can be deserialized using any cbor library for any programming languages. For example, using the [cbor2](https://pypi.org/project/cbor2/) program, the command to run is: `cbor2 --pretty toy_params/lwe_a.cbor`.
+They are TFHE-rs objects serialized in the [cbor format](https://cbor.io/). You can deserialize them using any cbor library for the language of your choice. For example, using the [cbor2](https://pypi.org/project/cbor2/) program, run: `cbor2 --pretty toy_params/lwe_a.cbor`.

-There are 2 folders with test vectors for different parameter sets:
- `valid_params_128`: valid classical PBS parameters using a Gaussian noise distribution, providing 128-bits of security in the IND-CPA model (i.e., the probability of failure is smaller than 2^{-64}).
- `toy_params`: insecure parameters that yield smaller values to simplify the bit comparison of the results.
+You will find 2 folders with test vectors for different parameter sets:
+- `valid_params_128`: valid classical PBS parameters using a Gaussian noise distribution, providing 128 bits of security in the IND-CPA model and a bootstrapping probability of failure of 2^{-64}.
+- `toy_params`: insecure parameters that yield smaller values.

-The values are generated to compute a keyswitch (KS) followed by a bootstrap (PBS). The cleartext inputs are 2 values, A and B defined below.
+The values are generated for the keyswitch -> bootstrap (KS-PBS) atomic pattern. The cleartext inputs are 2 values, A and B defined below.

 All the random values are generated from a fixed seed, that can be found in the `RAND_SEED` constant below. The PRNG used is the one based on the AES block cipher in counter mode, from tfhe `tfhe-csprng` crate.

-The bootstrap is applied twice, with 2 different lut, the identity lut and a specific one computing the double of the input value (i.e., f(x) = 2*x).
+The programmable bootstrap is applied twice, with 2 different LUTs: the identity LUT and a specific one (currently a x2 operation).

 ## Vectors
 The following values are generated:

 ### Keys
-| name                   | description                                                                             | TFHE-rs type                |
-|------------------------|-----------------------------------------------------------------------------------------|-----------------------------|
-| `large_lwe_secret_key` | Encryption secret key, before the KS and after the PBS                                  | `LweSecretKey<Vec<u64>>`    |
-| `small_lwe_secret_key` | Secret key encrypting ciphertexts between the KS and the PBS                            | `LweSecretKey<Vec<u64>>`    |
-| `ksk`                  | The keyswitching key to convert a ct from the large key to the small one                | `LweKeyswitchKey<Vec<u64>>` |
+| name                   | description                                                                           | TFHE-rs type                |
+|------------------------|---------------------------------------------------------------------------------------|-----------------------------|
+| `large_lwe_secret_key` | Encryption secret key, before the KS and after the PBS                                | `LweSecretKey<Vec<u64>>`    |
+| `small_lwe_secret_key` | Secret key encrypting ciphertexts between the KS and the PBS                          | `LweSecretKey<Vec<u64>>`    |
+| `ksk`                  | The keyswitching key to convert a ct from the large key to the small one              | `LweKeyswitchKey<Vec<u64>>` |
 | `bsk`                  | the bootstrapping key to perform a programmable bootstrap on the keyswitched ciphertext | `LweBootstrapKey<Vec<u64>>` |


 ### Ciphertexts
-| name                 | description                                                                                         | TFHE-rs type               | Cleartext            |
-|----------------------|-----------------------------------------------------------------------------------------------------|----------------------------|----------------------|
-| `lwe_a`              | LWE Ciphertext encrypting A                                                                         | `LweCiphertext<Vec<u64>>`  | `A`                  |
-| `lwe_b`              | LWE Ciphertext encrypting B                                                                         | `LweCiphertext<Vec<u64>>`  | `B`                  |
-| `lwe_sum`            | LWE Ciphertext encrypting A plus lwe encryption of B                                                | `LweCiphertext<Vec<u64>>`  | `A+B`                |
-| `lwe_prod`           | LWE Ciphertext encrypting A times cleartext B                                                       | `LweCiphertext<Vec<u64>>`  | `A*B`                |
-| `lwe_ms`             | LWE Ciphertext encrypting A after a Modulus Switch from q to 2*N ([note](#non-native-encoding))     | `LweCiphertext<Vec<u64>>`  | `A`                  |
-| `lwe_ks`             | LWE Ciphertext encrypting A after a keyswitch from `large_lwe_secret_key` to `small_lwe_secret_key` | `LweCiphertext<Vec<u64>>`  | `A`                  |
-| `glwe_after_id_br`   | GLWE Ciphertext encrypting A after the application of the identity blind rotation on `lwe_ms`       | `GlweCiphertext<Vec<u64>>` | rotation of id LUT   |
-| `lwe_after_id_pbs`   | LWE Ciphertext encrypting A after the sample extract operation on `glwe_after_id_br`                | `LweCiphertext<Vec<u64>>`  | `A`                  |
-| `glwe_after_spec_br` | GLWE Ciphertext encrypting spec(A) after the application of the spec blind rotation on `lwe_ms`     | `GlweCiphertext<Vec<u64>>` | rotation of spec LUT |
-| `lwe_after_spec_pbs` | LWE Ciphertext encrypting spec(A) after the sample extract operation on `glwe_after_spec_br`        | `LweCiphertext<Vec<u64>>`  | `spec(A)`            |
+| name                 | description                                                                                                  | TFHE-rs type               | Cleartext    |
+|----------------------|--------------------------------------------------------------------------------------------------------------|----------------------------|--------------|
+| `lwe_a`              | Lwe encryption of A                                                                                          | `LweCiphertext<Vec<u64>>`  | `A`          |
+| `lwe_b`              | Lwe encryption of B                                                                                          | `LweCiphertext<Vec<u64>>`  | `B`          |
+| `lwe_sum`            | Lwe encryption of A plus lwe encryption of B                                                                 | `LweCiphertext<Vec<u64>>`  | `A+B`        |
+| `lwe_prod`           | Lwe encryption of A times cleartext B                                                                        | `LweCiphertext<Vec<u64>>`  | `A*B`        |
+| `lwe_ms`             | The lwe ciphertext after the modswitch part of the PBS ([note](#non-native-encoding))                        | `LweCiphertext<Vec<u64>>`  | `A`          |
+| `lwe_ks`             | The lwe ciphertext after the keyswitch                                                                       | `LweCiphertext<Vec<u64>>`  | `A`          |
+| `glwe_after_id_br`   | The glwe returned by the application of the identity blind rotation on the mod switched ciphertexts.         | `GlweCiphertext<Vec<u64>>` | rot id LUT   |
+| `lwe_after_id_pbs`   | The lwe returned by the application of the sample extract operation on the output of the id blind rotation   | `LweCiphertext<Vec<u64>>`  | `A`          |
+| `glwe_after_spec_br` | The glwe returned by the application of the spec blind rotation on the mod switched ciphertexts.             | `GlweCiphertext<Vec<u64>>` | rot spec LUT |
+| `lwe_after_spec_pbs` | The lwe returned by the application of the sample extract operation on the output of the spec blind rotation | `LweCiphertext<Vec<u64>>`  | `spec(A)`    |

 Ciphertexts with the `_karatsuba` suffix are generated using the Karatsuba polynomial multiplication algorithm in the blind rotation, while default ciphertexts are generated using an FFT multiplication.
 This makes it easier to reproduce bit exact results.
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.13.0"
+version = "0.13.2"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -86,7 +86,6 @@ fn main() {
            "cuda/include/integer/integer.h",
            "cuda/include/integer/rerand.h",
            "cuda/include/aes/aes.h",
-            "cuda/include/trivium/trivium.h",
            "cuda/include/zk/zk.h",
            "cuda/include/keyswitch/keyswitch.h",
            "cuda/include/keyswitch/ks_enums.h",
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -29,13 +29,15 @@ template <typename Torus> struct int_aes_lut_buffers {
        allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus, Torus)> and_lambda =
        [](Torus a, Torus b) -> Torus { return a & b; };
-
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
+        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, and_lambda, allocate_gpu_memory);
    auto active_streams_and_lut = streams.active_gpu_subset(
        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
        params.pbs_type);
-    this->and_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_and_lut, {0}, {and_lambda}, allocate_gpu_memory);
-
+    this->and_lut->broadcast_lut(active_streams_and_lut);
    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->flush_lut = new int_radix_lut<Torus>(
@@ -44,11 +46,14 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
      return x & 1;
    };
-
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
+        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, flush_lambda, allocate_gpu_memory);
    auto active_streams_flush_lut = streams.active_gpu_subset(
        AES_STATE_BITS * num_aes_inputs, params.pbs_type);
-    this->flush_lut->generate_and_broadcast_lut(
-        active_streams_flush_lut, {0}, {flush_lambda}, allocate_gpu_memory);
+    this->flush_lut->broadcast_lut(active_streams_flush_lut);
    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

    this->carry_lut = new int_radix_lut<Torus>(
@@ -56,11 +61,14 @@ template <typename Torus> struct int_aes_lut_buffers {
    std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
      return (x >> 1) & 1;
    };
-
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
+        this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, carry_lambda, allocate_gpu_memory);
    auto active_streams_carry_lut =
        streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
-    this->carry_lut->generate_and_broadcast_lut(
-        active_streams_carry_lut, {0}, {carry_lambda}, allocate_gpu_memory);
+    this->carry_lut->broadcast_lut(active_streams_carry_lut);
    this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
  }

--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -10,11 +10,7 @@ extern std::mutex m;
 extern bool p2p_enabled;
 extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
 extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
-
-extern "C" {
-int32_t cuda_setup_multi_gpu(int device_0_id);
-}
-
+extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
 // Define a variant type that can be either a vector or a single pointer
 template <typename Torus>
 using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
@@ -42,6 +38,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,

 uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                              PBS_TYPE pbs_type);
+uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
+                                   PBS_TYPE pbs_type);

 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

@@ -70,7 +68,7 @@ public:
  // Construct an empty set. Invalid use of an empty set should raise an error
  // right away through asserts or because of a nullptr dereference
  CudaStreams()
-      : _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
+      : _streams(nullptr), _gpu_indexes(nullptr), _gpu_count(0),
        _owns_streams(false) {}

  // Returns a subset of this set as an active subset. An active subset is one
@@ -80,7 +78,15 @@ public:
        _streams, _gpu_indexes,
        get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
  }
-
+  // Returns a subset of this set as an active subset for pbs128. An active
+  // subset is one that is temporarily used to perform some computation. For
+  // pbs128, the threshold is different, because the original threshold was
+  // designed for 2_2 params.
+  CudaStreams active_gpu_subset_u128(int num_radix_blocks, PBS_TYPE pbs_type) {
+    return CudaStreams(
+        _streams, _gpu_indexes,
+        get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type));
+  }
  // Returns a CudaStreams struct containing only the ith stream
  CudaStreams get_ith(int i) const {
    return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
@@ -114,11 +120,13 @@ public:
  // streams on the same GPU
  void create_on_same_gpus(const CudaStreams &other) {
    PANIC_IF_FALSE(_streams == nullptr,
-                   "Assign clone to non-empty cudastreams");
+                   "Cuda error: Assign clone to non-empty CudaStreams");
+    PANIC_IF_FALSE(_gpu_count <= 8,
+                   "Cuda error: GPU count should be in the interval [0, 8]");

    cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];

-    uint32_t *gpu_indexes_clone = new uint32_t[_gpu_count];
+    uint32_t *gpu_indexes_clone = new uint32_t[other._gpu_count];
    for (uint32_t i = 0; i < other._gpu_count; ++i) {
      new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
      gpu_indexes_clone[i] = other._gpu_indexes[i];
@@ -170,6 +178,7 @@ public:
      _streams = nullptr;
      delete[] _gpu_indexes;
      _gpu_indexes = nullptr;
+      _gpu_count = 0;
    }
  }

@@ -483,4 +492,38 @@ public:
  }
 };

+// Event pool for managing temporary CUDA events in scatter/gather operations
+struct CudaEventPool {
+private:
+  std::vector<cudaEvent_t> _events;
+  std::vector<uint32_t> _gpu_indices;
+
+public:
+  CudaEventPool() {}
+
+  // Requests a new event from the pool (creates and stores it)
+  cudaEvent_t request_event(uint32_t gpu_index) {
+    cudaEvent_t event = cuda_create_event(gpu_index);
+    _events.push_back(event);
+    _gpu_indices.push_back(gpu_index);
+    return event;
+  }
+
+  // Releases all pooled events
+  // This should always be called in the release of the LUT, so streams
+  // are already synchronized
+  void release() {
+    for (size_t i = 0; i < _events.size(); i++) {
+      cuda_event_destroy(_events[i], _gpu_indices[i]);
+    }
+    _events.clear();
+    _gpu_indices.clear();
+  }
+
+  ~CudaEventPool() {
+    GPU_ASSERT(_events.empty(),
+               "CudaEventPool: must call release before destruction");
+  }
+};
+
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/bitwise_ops.h
@@ -65,8 +65,14 @@ template <typename Torus> struct boolean_bitop_buffer {
        return x % params.message_modulus;
      };

-      message_extract_lut->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          message_extract_lut->get_lut(0, 0),
+          message_extract_lut->get_degree(0),
+          message_extract_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_message_extract, gpu_memory_allocated);
+      message_extract_lut->broadcast_lut(active_streams);
    }
    tmp_lwe_left = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -136,8 +142,12 @@ template <typename Torus> struct int_bitop_buffer {
          }
        };

-        lut->generate_and_broadcast_bivariate_lut(
-            active_streams, {0}, {lut_bivariate_f}, gpu_memory_allocated);
+        generate_device_accumulator_bivariate<Torus>(
+            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+            lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
+        lut->broadcast_lut(active_streams);
      }
      break;
    default:
@@ -146,8 +156,6 @@ template <typename Torus> struct int_bitop_buffer {
                                     num_radix_blocks, allocate_gpu_memory,
                                     size_tracker);

-      std::vector<std::function<Torus(Torus)>> lut_funcs;
-      std::vector<uint32_t> lut_indices;
      for (int i = 0; i < params.message_modulus; i++) {
        auto rhs = i;

@@ -163,13 +171,14 @@ template <typename Torus> struct int_bitop_buffer {
            return x ^ rhs;
          }
        };
-
-        lut_funcs.push_back(lut_univariate_scalar_f);
-        lut_indices.push_back(i);
+        generate_device_accumulator<Torus>(
+            streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
+            lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_univariate_scalar_f,
+            gpu_memory_allocated);
+        lut->broadcast_lut(active_streams);
      }
-
-      lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
-                                      gpu_memory_allocated);
    }
  }

@@ -202,11 +211,16 @@ template <typename Torus> struct boolean_bitnot_buffer {
        return x % message_modulus;
      };

+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          message_extract_lut->get_lut(0, 0),
+          message_extract_lut->get_degree(0),
+          message_extract_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_message_extract, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
-
-      message_extract_lut->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      message_extract_lut->broadcast_lut(active_streams);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cast.h
@@ -28,17 +28,21 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
      uint32_t bits_per_block = std::log2(params.message_modulus);
      uint32_t msg_modulus = params.message_modulus;

-      auto active_streams =
-          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-
-      lut->generate_and_broadcast_lut(
-          active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          [msg_modulus, bits_per_block](Torus x) {
            const auto xm = x % msg_modulus;
            const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
            return (Torus)((msg_modulus - 1) * sign_bit);
-          }},
+          },
          allocate_gpu_memory);

+      auto active_streams =
+          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+      lut->broadcast_lut(active_streams);
+
      this->last_block = new CudaRadixCiphertextFFI;

      create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/cmux.h
@@ -85,6 +85,24 @@ template <typename Torus> struct int_cmux_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
+        predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
+        predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
+        message_extract_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        message_extract_lut_f, gpu_memory_allocated);
    Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
    for (int index = 0; index < 2 * num_radix_blocks; index++) {
      if (index < num_radix_blocks) {
@@ -97,18 +115,12 @@ template <typename Torus> struct int_cmux_buffer {
        predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
        2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-
    auto active_streams_pred =
        streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
-    predicate_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
-        gpu_memory_allocated);
-
+    predicate_lut->broadcast_lut(active_streams_pred);
    auto active_streams_msg =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-
-    message_extract_lut->generate_and_broadcast_lut(
-        active_streams_msg, {0}, {message_extract_lut_f}, gpu_memory_allocated);
+    message_extract_lut->broadcast_lut(active_streams_msg);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -28,7 +28,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
    Torus total_modulus = params.message_modulus * params.carry_modulus;
    uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);

-    int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
+    int max_chunks = CEIL_DIV(num_radix_blocks, max_value);
    tmp_out = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), tmp_out, num_radix_blocks,
@@ -39,21 +39,22 @@ template <typename Torus> struct int_are_all_block_true_buffer {
        max_chunks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

-    preallocated_h_lut = (Torus *)malloc(
-        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
-
    is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
                                            allocate_gpu_memory, size_tracker);
-
-    auto active_streams =
-        streams.active_gpu_subset(max_chunks, params.pbs_type);
-
    auto is_max_value_f = [max_value](Torus x) -> Torus {
      return x == max_value;
    };
+    preallocated_h_lut = (Torus *)malloc(
+        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
+        is_max_value->get_degree(0), is_max_value->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, is_max_value_f, gpu_memory_allocated);

-    is_max_value->generate_and_broadcast_lut(
-        active_streams, {0}, {is_max_value_f}, gpu_memory_allocated);
+    auto active_streams =
+        streams.active_gpu_subset(max_chunks, params.pbs_type);
+    is_max_value->broadcast_lut(active_streams);
  }

  void release(CudaStreams streams) {
@@ -102,10 +103,15 @@ template <typename Torus> struct int_comparison_eq_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
+        is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    is_non_zero_lut->generate_and_broadcast_lut(
-        active_streams, {0}, {is_non_zero_lut_f}, gpu_memory_allocated);
+    is_non_zero_lut->broadcast_lut(active_streams);

    // Scalar may have up to num_radix_blocks blocks
    scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -123,28 +129,32 @@ template <typename Torus> struct int_comparison_eq_buffer {
        return (lhs == rhs);
      }
    };
-
-    std::vector<std::function<Torus(Torus)>> lut_funcs;
-    std::vector<uint32_t> lut_indices;
    for (int i = 0; i < total_modulus; i++) {
      auto lut_f = [i, operator_f](Torus x) -> Torus {
        return operator_f(i, x);
      };
-      lut_funcs.push_back(lut_f);
-      lut_indices.push_back(i);
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          scalar_comparison_luts->get_lut(0, i),
+          scalar_comparison_luts->get_degree(i),
+          scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f, gpu_memory_allocated);
    }
-
-    scalar_comparison_luts->generate_and_broadcast_lut(
-        active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
-
+    scalar_comparison_luts->broadcast_lut(active_streams);
    if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
      operator_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

-      operator_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {operator_f}, gpu_memory_allocated);
-      // operator_lut->broadcast_lut(active_streams);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
+          operator_lut->get_degree(0), operator_lut->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, operator_f, gpu_memory_allocated);
+
+      operator_lut->broadcast_lut(active_streams);
    } else {
      operator_lut = nullptr;
    }
@@ -211,6 +221,9 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
        streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    // LUTs
+    tree_inner_leaf_lut =
+        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
+                                 allocate_gpu_memory, size_tracker);

    tree_last_leaf_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -221,14 +234,15 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
    tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
        streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-    tree_inner_leaf_lut =
-        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
-                                 allocate_gpu_memory, size_tracker);
-
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
+        tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        block_selector_f, gpu_memory_allocated);
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {block_selector_f}, allocate_gpu_memory);
+    tree_inner_leaf_lut->broadcast_lut(active_streams);
  }

  void release(CudaStreams streams) {
@@ -412,8 +426,12 @@ template <typename Torus> struct int_comparison_buffer {
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);

-    identity_lut->generate_and_broadcast_lut(
-        active_streams, {0}, {identity_lut_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
+        identity_lut->get_degree(0), identity_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, identity_lut_f, gpu_memory_allocated);
+    identity_lut->broadcast_lut(active_streams);

    uint32_t total_modulus = params.message_modulus * params.carry_modulus;
    auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -423,8 +441,13 @@ template <typename Torus> struct int_comparison_buffer {
    is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                           allocate_gpu_memory, size_tracker);

-    is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
-                                            gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
+        is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, is_zero_f, gpu_memory_allocated);
+
+    is_zero_lut->broadcast_lut(active_streams);

    switch (op) {
    case COMPARISON_TYPE::MAX:
@@ -499,9 +522,13 @@ template <typename Torus> struct int_comparison_buffer {
        PANIC("Cuda error: sign_lut creation failed due to wrong function.")
      };

+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
+          signed_lut->get_degree(0), signed_lut->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, signed_lut_f, gpu_memory_allocated);
      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      signed_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {signed_lut_f}, gpu_memory_allocated);
+      signed_lut->broadcast_lut(active_streams);
    }
    preallocated_h_lut = (Torus *)malloc(
        (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -11,16 +11,26 @@ template <typename Torus> struct int_compression {
  Torus *tmp_glwe_array_out;
  bool gpu_memory_allocated;
  uint32_t lwe_per_glwe;
+  uint32_t max_num_glwes;

+  // num_radix_blocks: total number of LWE ciphertexts (radix blocks) to
+  // compress lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size),
+  // defined by the chosen parameter set
  int_compression(CudaStreams streams, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
+    this->lwe_per_glwe = lwe_per_glwe;

    uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                     compression_params.polynomial_size;

+    // Calculate the actual number of GLWEs needed based on total radix blocks.
+    // This ensures we allocate enough memory when num_radix_blocks >
+    // lwe_per_glwe.
+    max_num_glwes = CEIL_DIV(num_radix_blocks, lwe_per_glwe);
+
    tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
        num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
            sizeof(Torus),
@@ -28,7 +38,7 @@ template <typename Torus> struct int_compression {
        allocate_gpu_memory));
    tmp_glwe_array_out =
        static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
-            lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
+            max_num_glwes * glwe_accumulator_size * sizeof(Torus),
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory));

--- a/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/div_rem.h
@@ -283,9 +283,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
                                     zero_out_if_not_1_lut_2};
    size_t lut_gpu_indexes[2] = {0, 3};
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
-                                          {0}, {zero_out_if_not_1_lut_f},
-                                          gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(lut_gpu_indexes[j]),
+          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
    }

    luts[0] = zero_out_if_not_2_lut_1;
@@ -293,9 +296,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    lut_gpu_indexes[0] = 1;
    lut_gpu_indexes[1] = 2;
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
-                                          {0}, {zero_out_if_not_2_lut_f},
-                                          gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(lut_gpu_indexes[j]),
+          streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
    }

    quotient_lut_1 =
@@ -315,12 +321,21 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    };
    auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };

-    quotient_lut_1->generate_and_broadcast_lut(
-        streams.get_ith(2), {0}, {quotient_lut_1_f}, gpu_memory_allocated);
-    quotient_lut_2->generate_and_broadcast_lut(
-        streams.get_ith(1), {0}, {quotient_lut_2_f}, gpu_memory_allocated);
-    quotient_lut_3->generate_and_broadcast_lut(
-        streams.get_ith(0), {0}, {quotient_lut_3_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
+        quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
+        quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
+        quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);

    message_extract_lut_1 = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -335,12 +350,15 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
    luts[0] = message_extract_lut_1;
    luts[1] = message_extract_lut_2;

-    auto active_streams =
-        streams.active_gpu_subset(num_blocks, params.pbs_type);
-
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
+      auto active_streams =
+          streams.active_gpu_subset(num_blocks, params.pbs_type);
+      luts[j]->broadcast_lut(active_streams);
    }
  }

@@ -989,14 +1007,24 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      masking_luts_2[i] = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
+          masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_masking, gpu_memory_allocated);
      auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
-      masking_luts_1[i]->generate_and_broadcast_lut(
-          active_streams_1, {0}, {lut_f_masking}, gpu_memory_allocated);
+      masking_luts_1[i]->broadcast_lut(active_streams_1);

+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
+          masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          lut_f_masking, gpu_memory_allocated);
      auto active_streams_2 =
          streams.active_gpu_subset(num_blocks, params.pbs_type);
-      masking_luts_2[i]->generate_and_broadcast_lut(
-          active_streams_2, {0}, {lut_f_masking}, gpu_memory_allocated);
+      masking_luts_2[i]->broadcast_lut(active_streams_2);
    }

    // create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1014,12 +1042,15 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

    int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
                                     message_extract_lut_2};
-
    auto active_streams =
        streams.active_gpu_subset(num_blocks, params.pbs_type);
    for (int j = 0; j < 2; j++) {
-      luts[j]->generate_and_broadcast_lut(
-          active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
+          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
+      luts[j]->broadcast_lut(active_streams);
    }

    // Give name to closures to improve readability
@@ -1110,8 +1141,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
      merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

-      merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
-          active_gpu_count_for_bits, {0}, {lut_f_bit}, gpu_memory_allocated);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          merge_overflow_flags_luts[i]->get_lut(0, 0),
+          merge_overflow_flags_luts[i]->get_degree(0),
+          merge_overflow_flags_luts[i]->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, lut_f_bit, gpu_memory_allocated);
+      merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
    }
  }

@@ -1520,12 +1557,16 @@ template <typename Torus> struct int_div_rem_memory {
      compare_signed_bits_lut = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          compare_signed_bits_lut->get_lut(0, 0),
+          compare_signed_bits_lut->get_degree(0),
+          compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          f_compare_extracted_signed_bits, gpu_memory_allocated);
      auto active_gpu_count_cmp =
          streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
-
-      compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
-          active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
-          gpu_memory_allocated);
+      compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/ilog2.h
@@ -53,8 +53,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return count;
    };

-    univ_lut_mem->generate_and_broadcast_lut(
-        active_streams, {0}, {generate_uni_lut_lambda}, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
+        univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
+
+    univ_lut_mem->broadcast_lut(active_streams);

    auto generate_bi_lut_lambda =
        [num_bits](Torus block_num_bit_count,
@@ -65,8 +70,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
      return 0;
    };

-    biv_lut_mem->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {generate_bi_lut_lambda}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
+        biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
+
+    biv_lut_mem->broadcast_lut(active_streams);

    this->tmp_ct = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -222,7 +232,7 @@ template <typename Torus> struct int_ilog2_buffer {
        this->sum_output_not_propagated, counter_num_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

-    lut_message_not =
+    this->lut_message_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
                                 allocate_gpu_memory, size_tracker);
    std::function<Torus(Torus)> lut_message_lambda =
@@ -230,11 +240,16 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t message = x % this->params.message_modulus;
      return (~message) % this->params.message_modulus;
    };
-
+    generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
+                                this->lut_message_not->get_lut(0, 0),
+                                this->lut_message_not->get_degree(0),
+                                this->lut_message_not->get_max_degree(0),
+                                params.glwe_dimension, params.polynomial_size,
+                                params.message_modulus, params.carry_modulus,
+                                lut_message_lambda, allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
-    lut_message_not->generate_and_broadcast_lut(
-        active_streams, {0}, {lut_message_lambda}, allocate_gpu_memory);
+    lut_message_not->broadcast_lut(active_streams);

    this->lut_carry_not =
        new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -244,8 +259,13 @@ template <typename Torus> struct int_ilog2_buffer {
      uint64_t carry = x / this->params.message_modulus;
      return (~carry) % this->params.message_modulus;
    };
-    lut_carry_not->generate_and_broadcast_lut(
-        active_streams, {0}, {lut_carry_lambda}, allocate_gpu_memory);
+    generate_device_accumulator(
+        streams.stream(0), streams.gpu_index(0),
+        this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
+        this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        lut_carry_lambda, allocate_gpu_memory);
+    lut_carry_not->broadcast_lut(active_streams);

    this->message_blocks_not = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -9,7 +9,6 @@
 #include "utils/helper_multi_gpu.cuh"
 #include <cmath>
 #include <functional>
-#include <map>
 #include <queue>

 #include <stdio.h>
@@ -35,8 +34,10 @@ public:
    } else if ((msg_mod) == 4 && (carry_mod) == 4) {                           \
      constexpr int max_noise_level = 5;                                       \
      if ((noise_level_expr) > max_noise_level)                                \
-        PANIC("Cuda error: noise exceeds maximum authorized value for 2_2 "    \
-              "parameters");                                                   \
+        PANIC(                                                                 \
+            "Cuda error: noise %d exceeds maximum authorized value 5 for 2_2"  \
+            " parameters",                                                     \
+            noise_level_expr);                                                 \
    } else if ((msg_mod) == 8 && (carry_mod) == 8) {                           \
      constexpr int max_noise_level = 9;                                       \
      if ((noise_level_expr) > max_noise_level)                                \
@@ -350,6 +351,7 @@ struct int_radix_lut_custom_input_output {

  CudaStreamsBarrier multi_gpu_scatter_barrier, multi_gpu_broadcast_barrier;
  CudaStreamsBarrier multi_gpu_gather_barrier;
+  CudaEventPool event_pool;

  // Setup the LUT configuration:
  // input_big_lwe_dimension: BIG LWE dimension of the KS output / PBS input
@@ -372,8 +374,13 @@ struct int_radix_lut_custom_input_output {
    this->num_input_blocks = num_input_blocks;
    this->gpu_memory_allocated = allocate_gpu_memory;

-    this->active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    if (sizeof(OutputTorus) == 16) {
+      this->active_streams =
+          streams.active_gpu_subset_u128(num_radix_blocks, params.pbs_type);
+    } else {
+      this->active_streams =
+          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    }
  }

  void setup_degrees() {
@@ -385,9 +392,13 @@ struct int_radix_lut_custom_input_output {
  void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {

+    int classical_threshold =
+        sizeof(OutputTorus) == 16
+            ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+            : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                        : classical_threshold;

    for (uint i = 0; i < active_streams.count(); i++) {
      cuda_set_device(active_streams.gpu_index(i));
@@ -459,11 +470,14 @@ struct int_radix_lut_custom_input_output {
                               lwe_trivial_indexes, num_radix_blocks,
                               allocate_gpu_memory);
  }
-
  void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
+    int classical_threshold =
+        sizeof(OutputTorus) == 16
+            ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+            : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                        : classical_threshold;

    auto inputs_on_gpu = std::min(
        (int)num_input_blocks,
@@ -810,10 +824,13 @@ struct int_radix_lut_custom_input_output {
  void allocate_lwe_vector_for_non_trivial_indexes(
      CudaStreams streams, uint64_t max_num_radix_blocks,
      uint64_t &size_tracker, bool allocate_gpu_memory) {
-
+    int classical_threshold =
+        sizeof(OutputTorus) == 16
+            ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+            : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                        : classical_threshold;

    // We need to create the auxiliary array only in GPU 0
    if (active_streams.count() > 1) {
@@ -836,56 +853,6 @@ struct int_radix_lut_custom_input_output {
    }
  }

-  void generate_and_broadcast_lut(
-      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
-      std::vector<std::function<OutputTorus(OutputTorus)>> f,
-      bool gpu_memory_allocated) {
-    // streams should be a subset of active_streams
-
-    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
-      generate_device_accumulator<OutputTorus>(
-          streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
-          get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, f[i], gpu_memory_allocated);
-    }
-   //broadcast_lut(streams);
-  }
-
-  void generate_and_broadcast_bivariate_lut(
-      const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
-      std::vector<std::function<OutputTorus(OutputTorus, OutputTorus)>> f,
-      bool gpu_memory_allocated) {
-    // streams should be a subset of active_streams
-
-    /*    for (int fidx = 0; fidx < f.size(); ++fidx) {
-          __int128_t f_hash = 0;
-          uint32_t bits_per_lut_val = 5;
-          uint32_t input_modulus_sup =
-              params.message_modulus * params.carry_modulus;
-          for (uint32_t i = 0; i < input_modulus_sup; ++i) {
-            OutputTorus f_eval =
-                f[fidx](i / params.message_modulus, i % params.message_modulus);
-            GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
-                       "LUT value expected bitwidth overflow");
-            f_hash |= f_eval;
-            f_hash <<= bits_per_lut_val;
-          }
-          printf("%016llX%016llX\n",
-                 (unsigned long long)((f_hash >> 64) & 0xFFFFFFFFFFFFFFFF),
-                 (unsigned long long)(f_hash & 0xFFFFFFFFFFFFFFFF));
-        }
-    */
-    for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
-      generate_device_accumulator_bivariate<InputTorus>(
-          streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
-          get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
-          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, f[i], gpu_memory_allocated);
-    }
-    //broadcast_lut(streams);
-  }
-
  void release(CudaStreams streams) {
    PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(),
                   "Lut vec and Lut vec indexes must have the same size");
@@ -916,6 +883,7 @@ struct int_radix_lut_custom_input_output {

    if (active_streams.count() > 1) {
      active_streams.synchronize();
+      event_pool.release();
      multi_gpu_gather_barrier.release();
      multi_gpu_broadcast_barrier.release();
      multi_gpu_scatter_barrier.release();
@@ -1036,15 +1004,18 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
                                   bits_per_block * num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

-    std::vector<std::function<Torus(Torus)>> lut_funs;
-    std::vector<uint32_t> lut_indices;
    for (int i = 0; i < bits_per_block; i++) {
+
      auto operator_f = [i, final_offset](Torus x) -> Torus {
        Torus y = (x >> i) & 1;
        return y << final_offset;
      };
-      lut_funs.push_back(operator_f);
-      lut_indices.push_back(i);
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
+          lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          operator_f, gpu_memory_allocated);
    }

    /**
@@ -1063,10 +1034,7 @@ template <typename Torus> struct int_bit_extract_luts_buffer {

    auto active_streams = streams.active_gpu_subset(
        bits_per_block * num_radix_blocks, params.pbs_type);
-
-    lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
-                                    gpu_memory_allocated);
-    // lut->broadcast_lut(active_streams);
+    lut->broadcast_lut(active_streams);

    /**
     * the input indexes should take the first bits_per_block PBS to target
@@ -1142,6 +1110,24 @@ template <typename Torus> struct int_fullprop_buffer {
    };

    //
+    Torus *lut_buffer_message = lut->get_lut(0, 0);
+    uint64_t *message_degree = lut->get_degree(0);
+    uint64_t *message_max_degree = lut->get_max_degree(0);
+    Torus *lut_buffer_carry = lut->get_lut(0, 1);
+    uint64_t *carry_degree = lut->get_degree(1);
+    uint64_t *carry_max_degree = lut->get_max_degree(1);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut_buffer_message,
+        message_degree, message_max_degree, params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        lut_f_message, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut_buffer_carry, carry_degree,
+        carry_max_degree, params.glwe_dimension, params.polynomial_size,
+        params.message_modulus, params.carry_modulus, lut_f_carry,
+        gpu_memory_allocated);

    uint64_t lwe_indexes_size = 2 * sizeof(Torus);
    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
@@ -1151,15 +1137,9 @@ template <typename Torus> struct int_fullprop_buffer {
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-
    //
    // No broadcast is needed because full prop is done on 1 single GPU.
-    // By passing a single-GPU CudaStreams with streams.get_ith(0) the LUT is
-    // not broadcast.
    //
-    lut->generate_and_broadcast_lut(streams.get_ith(0), {0, 1},
-                                    {lut_f_message, lut_f_carry},
-                                    gpu_memory_allocated);

    tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
@@ -1277,10 +1257,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
      if (total_ciphertexts > 0 ||
          reduce_degrees_for_single_carry_propagation) {
        uint64_t size_tracker = 0;
-        allocated_luts_message_carry = true;
        luts_message_carry = new int_radix_lut<Torus>(
            streams, params, 2, pbs_count, true, size_tracker);
-
+        allocated_luts_message_carry = true;
        uint64_t message_modulus_bits =
            (uint64_t)std::log2(params.message_modulus);
        uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
@@ -1296,9 +1275,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
            streams, upper_bound_num_blocks, size_tracker, true);
      }
    }
-
    if (allocated_luts_message_carry) {
-
      auto message_acc = luts_message_carry->get_lut(0, 0);
      auto carry_acc = luts_message_carry->get_lut(0, 1);

@@ -1310,11 +1287,22 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
        return x / message_modulus;
      };

+      // generate accumulators
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), message_acc,
+          luts_message_carry->get_degree(0),
+          luts_message_carry->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, message_modulus, params.carry_modulus,
+          lut_f_message, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), carry_acc,
+          luts_message_carry->get_degree(1),
+          luts_message_carry->get_max_degree(1), params.glwe_dimension,
+          params.polynomial_size, message_modulus, params.carry_modulus,
+          lut_f_carry, gpu_memory_allocated);
      auto active_gpu_count_mc =
          streams.active_gpu_subset(pbs_count, params.pbs_type);
-      luts_message_carry->generate_and_broadcast_lut(
-          active_gpu_count_mc, {0, 1}, {lut_f_message, lut_f_carry},
-          gpu_memory_allocated);
+      luts_message_carry->broadcast_lut(active_gpu_count_mc);
    }
  }
  int_sum_ciphertexts_vec_memory(
@@ -1449,6 +1437,10 @@ template <typename Torus> struct int_seq_group_prop_memory {
                            uint32_t group_size, uint32_t big_lwe_size_bytes,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
+    auto glwe_dimension = params.glwe_dimension;
+    auto polynomial_size = params.polynomial_size;
+    auto message_modulus = params.message_modulus;
+    auto carry_modulus = params.carry_modulus;

    grouping_size = group_size;
    group_resolved_carries = new CudaRadixCiphertextFFI;
@@ -1458,20 +1450,22 @@ template <typename Torus> struct int_seq_group_prop_memory {
        allocate_gpu_memory);

    int num_seq_luts = grouping_size - 1;
+    Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
    lut_sequential_algorithm =
        new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
                                 allocate_gpu_memory, size_tracker);
-    std::vector<std::function<Torus(Torus)>> lut_funcs;
-    std::vector<uint32_t> lut_indices;
-    Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
-
    for (int index = 0; index < num_seq_luts; index++) {
      auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
        return (propa_cum_sum_block >> (index + 1)) & 1;
      };
-      lut_funcs.push_back(f_lut_sequential);
+      auto seq_lut = lut_sequential_algorithm->get_lut(0, index);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), seq_lut,
+          lut_sequential_algorithm->get_degree(index),
+          lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
+          gpu_memory_allocated);
      h_seq_lut_indexes[index] = index;
-      lut_indices.push_back(index);
    }
    Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1479,12 +1473,9 @@ template <typename Torus> struct int_seq_group_prop_memory {
        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(num_seq_luts, params.pbs_type);
-    lut_sequential_algorithm->generate_and_broadcast_lut(
-        active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
-    // lut_sequential_algorithm->broadcast_lut(active_streams);
+    lut_sequential_algorithm->broadcast_lut(active_streams);
    free(h_seq_lut_indexes);
-  }
-
+  };
  void release(CudaStreams streams) {
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   group_resolved_carries,
@@ -1506,6 +1497,10 @@ template <typename Torus> struct int_hs_group_prop_memory {
                           uint32_t num_groups, uint32_t big_lwe_size_bytes,
                           bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
+    auto glwe_dimension = params.glwe_dimension;
+    auto polynomial_size = params.polynomial_size;
+    auto message_modulus = params.message_modulus;
+    auto carry_modulus = params.carry_modulus;

    auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus {
      if (msb == 2) {
@@ -1525,11 +1520,16 @@ template <typename Torus> struct int_hs_group_prop_memory {
    lut_hillis_steele = new int_radix_lut<Torus>(
        streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);

+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
+        lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, f_lut_hillis_steele,
+        gpu_memory_allocated);
    auto active_streams =
        streams.active_gpu_subset(num_groups, params.pbs_type);
-    lut_hillis_steele->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {f_lut_hillis_steele}, gpu_memory_allocated);
-  }
+    lut_hillis_steele->broadcast_lut(active_streams);
+  };
  void release(CudaStreams streams) {

    lut_hillis_steele->release(streams);
@@ -1819,6 +1819,112 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
      num_extra_luts = 1;
    }

+    uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
+    luts_array_second_step = new int_radix_lut<Torus>(
+        streams, params, num_luts_second_step, num_radix_blocks,
+        allocate_gpu_memory, size_tracker);
+
+    // luts for first group inner propagation
+    for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
+      auto f_first_grouping_inner_propagation =
+          [lut_id](Torus propa_cum_sum_block) -> Torus {
+        uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
+
+        if (carry != 0) {
+          return 2ull; // Generates Carry
+        } else {
+          return 0ull; // Does not generate carry
+        }
+      };
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          luts_array_second_step->get_lut(0, lut_id),
+          luts_array_second_step->get_degree(lut_id),
+          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus,
+          f_first_grouping_inner_propagation, gpu_memory_allocated);
+    }
+
+    auto f_first_grouping_outer_propagation =
+        [num_bits_in_block](Torus block) -> Torus {
+      return (block >> (num_bits_in_block - 1)) & 1;
+    };
+
+    int lut_id = grouping_size - 1;
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        luts_array_second_step->get_lut(0, lut_id),
+        luts_array_second_step->get_degree(lut_id),
+        luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        f_first_grouping_outer_propagation, gpu_memory_allocated);
+
+    // for other groupings inner propagation
+    for (int index = 0; index < grouping_size; index++) {
+      uint32_t lut_id = index + grouping_size;
+
+      auto f_other_groupings_inner_propagation =
+          [index](Torus propa_cum_sum_block) -> Torus {
+        uint64_t mask = (2 << index) - 1;
+        if (propa_cum_sum_block >= (2 << index)) {
+          return 2ull; // Generates
+        } else if ((propa_cum_sum_block & mask) == mask) {
+          return 1ull; // Propagate
+        } else {
+          return 0ull; // Nothing
+        }
+      };
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          luts_array_second_step->get_lut(0, lut_id),
+          luts_array_second_step->get_degree(lut_id),
+          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus,
+          f_other_groupings_inner_propagation, gpu_memory_allocated);
+    }
+
+    if (use_sequential_algorithm_to_resolve_group_carries) {
+      for (int index = 0; index < grouping_size - 1; index++) {
+        uint32_t lut_id = index + 2 * grouping_size;
+
+        auto f_group_propagation = [index, block_modulus,
+                                    num_bits_in_block](Torus block) -> Torus {
+          if (block == (block_modulus - 1)) {
+            return 0ull;
+          } else {
+            return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
+          }
+        };
+
+        generate_device_accumulator<Torus>(
+            streams.stream(0), streams.gpu_index(0),
+            luts_array_second_step->get_lut(0, lut_id),
+            luts_array_second_step->get_degree(lut_id),
+            luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+            polynomial_size, message_modulus, carry_modulus,
+            f_group_propagation, gpu_memory_allocated);
+      }
+    } else {
+      uint32_t lut_id = 2 * grouping_size;
+      auto f_group_propagation = [block_modulus](Torus block) {
+        if (block == (block_modulus - 1)) {
+          return 2ull;
+        } else {
+          return UINT64_MAX % (block_modulus * 2ull);
+        }
+      };
+
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          luts_array_second_step->get_lut(0, lut_id),
+          luts_array_second_step->get_degree(lut_id),
+          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_group_propagation,
+          gpu_memory_allocated);
+    }
+
    Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);

    for (int index = 0; index < num_radix_blocks; index++) {
@@ -1854,11 +1960,6 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
      }
    }

-    uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
-    luts_array_second_step = new int_radix_lut<Torus>(
-        streams, params, num_luts_second_step, num_radix_blocks,
-        allocate_gpu_memory, size_tracker);
-
    // copy the indexes to the gpu
    Torus *second_lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1869,92 +1970,9 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
        scalar_array_cum_sum, h_scalar_array_cum_sum,
        num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-
-    std::vector<std::function<Torus(Torus)>> lut_funcs;
-    std::vector<uint32_t> lut_ids;
-
-    // luts for first group inner propagation
-    for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
-      auto f_first_grouping_inner_propagation =
-          [lut_id](Torus propa_cum_sum_block) -> Torus {
-        uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
-
-        if (carry != 0) {
-          return 2ull; // Generates Carry
-        } else {
-          return 0ull; // Does not generate carry
-        }
-      };
-      lut_funcs.push_back(f_first_grouping_inner_propagation);
-      lut_ids.push_back(lut_id);
-    }
-
-    auto f_first_grouping_outer_propagation =
-        [num_bits_in_block](Torus block) -> Torus {
-      return (block >> (num_bits_in_block - 1)) & 1;
-    };
-
-    int lut_id = grouping_size - 1;
-
-    lut_funcs.push_back(f_first_grouping_outer_propagation);
-    lut_ids.push_back(lut_id);
-
-    // for other groupings inner propagation
-    for (int index = 0; index < grouping_size; index++) {
-      uint32_t lut_id = index + grouping_size;
-
-      auto f_other_groupings_inner_propagation =
-          [index](Torus propa_cum_sum_block) -> Torus {
-        uint64_t mask = (2 << index) - 1;
-        if (propa_cum_sum_block >= (2 << index)) {
-          return 2ull; // Generates
-        } else if ((propa_cum_sum_block & mask) == mask) {
-          return 1ull; // Propagate
-        } else {
-          return 0ull; // Nothing
-        }
-      };
-
-      lut_funcs.push_back(f_other_groupings_inner_propagation);
-      lut_ids.push_back(lut_id);
-    }
-
-    if (use_sequential_algorithm_to_resolve_group_carries) {
-      for (int index = 0; index < grouping_size - 1; index++) {
-        uint32_t lut_id = index + 2 * grouping_size;
-
-        auto f_group_propagation = [index, block_modulus,
-                                    num_bits_in_block](Torus block) -> Torus {
-          if (block == (block_modulus - 1)) {
-            return 0ull;
-          } else {
-            return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
-          }
-        };
-
-        lut_funcs.push_back(f_group_propagation);
-        lut_ids.push_back(lut_id);
-      }
-    } else {
-      uint32_t lut_id = 2 * grouping_size;
-      auto f_group_propagation = [block_modulus](Torus block) {
-        if (block == (block_modulus - 1)) {
-          return 2ull;
-        } else {
-          return UINT64_MAX % (block_modulus * 2ull);
-        }
-      };
-
-      lut_funcs.push_back(f_group_propagation);
-      lut_ids.push_back(lut_id);
-    }
-
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_array_second_step->generate_and_broadcast_lut(
-        active_streams, lut_ids, lut_funcs, gpu_memory_allocated);
-
-    // luts_array_second_step->broadcast_lut(active_streams);
+    luts_array_second_step->broadcast_lut(active_streams);

    if (use_sequential_algorithm_to_resolve_group_carries) {

@@ -2042,28 +2060,12 @@ template <typename Torus> struct int_sc_prop_memory {
  uint32_t requested_flag;
  bool gpu_memory_allocated;

-  void setup_message_extract_indices_for_carry_async(CudaStreams streams,
-                                                     uint32_t num_radix_blocks,
-                                                     bool allocate_gpu_memory) {
-    Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
-    for (int index = 0; index < num_radix_blocks + 1; index++) {
-      if (index < num_radix_blocks) {
-        h_lut_indexes[index] = 0;
-      } else {
-        h_lut_indexes[index] = 1;
-      }
-    }
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
-        (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
-        streams.gpu_index(0), allocate_gpu_memory);
-  }
-
  int_sc_prop_memory(CudaStreams streams, int_radix_params params,
                     uint32_t num_radix_blocks, uint32_t requested_flag_in,
                     bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->params = params;
+    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;
@@ -2072,7 +2074,7 @@ template <typename Torus> struct int_sc_prop_memory {
    uint32_t block_modulus = message_modulus * carry_modulus;
    uint32_t num_bits_in_block = std::log2(block_modulus);
    uint32_t grouping_size = num_bits_in_block;
-    num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
+    num_groups = CEIL_DIV(num_radix_blocks, grouping_size);

    num_many_lut = 2; // many luts apply 2 luts
    uint32_t box_size = polynomial_size / block_modulus;
@@ -2086,6 +2088,24 @@ template <typename Torus> struct int_sc_prop_memory {
        streams, params, num_radix_blocks, grouping_size, num_groups,
        allocate_gpu_memory, size_tracker);

+    //  Step 3 elements
+    int num_luts_message_extract =
+        requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
+    lut_message_extract = new int_radix_lut<Torus>(
+        streams, params, num_luts_message_extract, num_radix_blocks + 1,
+        allocate_gpu_memory, size_tracker);
+    // lut for the first block in the first grouping
+    auto f_message_extract = [message_modulus](Torus block) -> Torus {
+      return (block >> 1) % message_modulus;
+    };
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
+        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, f_message_extract,
+        gpu_memory_allocated);
+
    // This store a single block that with be used to store the overflow or
    // carry results
    output_flag = new CudaRadixCiphertextFFI;
@@ -2136,30 +2156,22 @@ template <typename Torus> struct int_sc_prop_memory {
        return output1 << 3 | output2 << 2;
      };

+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          lut_overflow_flag_prep->get_lut(0, 0),
+          lut_overflow_flag_prep->get_degree(0),
+          lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
+          gpu_memory_allocated);
+
      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
-      lut_overflow_flag_prep->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {f_overflow_fp}, gpu_memory_allocated);
+      lut_overflow_flag_prep->broadcast_lut(active_streams);
    }

-    //  Step 3 elements
-    int num_luts_message_extract =
-        requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
-    lut_message_extract = new int_radix_lut<Torus>(
-        streams, params, num_luts_message_extract, num_radix_blocks + 1,
-        allocate_gpu_memory, size_tracker);
-    // lut for the first block in the first grouping
-    auto f_message_extract = [message_modulus](Torus block) -> Torus {
-      return (block >> 1) % message_modulus;
-    };
-
-    auto active_streams =
-        streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
-
    // For the final cleanup in case of overflow or carry (it seems that I can)
    // It seems that this lut could be apply together with the other one but for
    // now we won't do it
-    switch (requested_flag) {
-    case outputFlag::FLAG_OVERFLOW: { // Overflow case
+    if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
      auto f_overflow_last = [num_radix_blocks,
                              requested_flag_in](Torus block) -> Torus {
        uint32_t position = (num_radix_blocks == 1 &&
@@ -2171,38 +2183,62 @@ template <typename Torus> struct int_sc_prop_memory {
        Torus does_overflow_if_carry_is_0 = (block >> 2) & 1;
        if (input_carry == outputFlag::FLAG_OVERFLOW) {
          return does_overflow_if_carry_is_1;
+        } else {
+          return does_overflow_if_carry_is_0;
        }
-        return does_overflow_if_carry_is_0;
      };
-      setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
-                                                    allocate_gpu_memory);

-      lut_message_extract->generate_and_broadcast_lut(
-          active_streams, {0, 1}, {f_message_extract, f_overflow_last},
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          lut_message_extract->get_lut(0, 1),
+          lut_message_extract->get_degree(1),
+          lut_message_extract->get_max_degree(1), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_overflow_last,
          gpu_memory_allocated);
-      break;
-    }
-    case outputFlag::FLAG_CARRY: { // Carry case

-      setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
-                                                    allocate_gpu_memory);
+      Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
+      for (int index = 0; index < num_radix_blocks + 1; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
+          streams.gpu_index(0), allocate_gpu_memory);
+    }
+    if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case

      auto f_carry_last = [](Torus block) -> Torus {
        return ((block >> 2) & 1);
      };

-      lut_message_extract->generate_and_broadcast_lut(
-          active_streams, {0, 1}, {f_message_extract, f_carry_last},
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          lut_message_extract->get_lut(0, 1),
+          lut_message_extract->get_degree(1),
+          lut_message_extract->get_max_degree(1), glwe_dimension,
+          polynomial_size, message_modulus, carry_modulus, f_carry_last,
          gpu_memory_allocated);
-      break;
-    }
-    default:
-      lut_message_extract->generate_and_broadcast_lut(
-          active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
-      break;
-    }

-    // lut_message_extract->broadcast_lut(active_streams);
+      Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
+      for (int index = 0; index < num_radix_blocks + 1; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
+          streams.gpu_index(0), allocate_gpu_memory);
+    }
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
+    lut_message_extract->broadcast_lut(active_streams);
  };

  void release(CudaStreams streams) {
@@ -2472,7 +2508,7 @@ template <typename Torus> struct int_borrow_prop_memory {
    uint32_t num_bits_in_block = std::log2(block_modulus);
    uint32_t grouping_size = num_bits_in_block;
    group_size = grouping_size;
-    num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
+    num_groups = CEIL_DIV(num_radix_blocks, grouping_size);

    num_many_lut = 2; // many luts apply 2 luts
    uint32_t box_size = polynomial_size / block_modulus;
@@ -2500,11 +2536,16 @@ template <typename Torus> struct int_borrow_prop_memory {
      return (block >> 1) % message_modulus;
    };

+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
+        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, f_message_extract,
+        gpu_memory_allocated);
    active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);

-    lut_message_extract->generate_and_broadcast_lut(
-        active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
+    lut_message_extract->broadcast_lut(active_streams);

    if (compute_overflow) {
      lut_borrow_flag =
@@ -2515,8 +2556,12 @@ template <typename Torus> struct int_borrow_prop_memory {
        return ((block >> 2) & 1);
      };

-      lut_borrow_flag->generate_and_broadcast_lut(
-          active_streams, {0}, {f_borrow_flag}, gpu_memory_allocated);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          lut_borrow_flag->get_lut(0, 0), lut_borrow_flag->get_degree(0),
+          lut_borrow_flag->get_max_degree(0), glwe_dimension, polynomial_size,
+          message_modulus, carry_modulus, f_borrow_flag, gpu_memory_allocated);
+      lut_borrow_flag->broadcast_lut(active_streams);
    }

    active_streams =
@@ -2537,7 +2582,9 @@ template <typename Torus> struct int_borrow_prop_memory {
  void release(CudaStreams streams) {

    shifted_blocks_borrow_state_mem->release(streams);
+    delete shifted_blocks_borrow_state_mem;
    prop_simu_group_carries_mem->release(streams);
+    delete prop_simu_group_carries_mem;
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   overflow_block, gpu_memory_allocated);

--- a/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/multiplication.h
@@ -37,14 +37,17 @@ template <typename Torus> struct int_mul_memory {
      zero_out_predicate_lut =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          zero_out_predicate_lut->get_lut(0, 0),
+          zero_out_predicate_lut->get_degree(0),
+          zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          zero_out_predicate_lut_f, gpu_memory_allocated);

      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {zero_out_predicate_lut_f},
-          gpu_memory_allocated);
-
-      // zero_out_predicate_lut->broadcast_lut(active_streams);
+      zero_out_predicate_lut->broadcast_lut(active_streams);

      zero_out_mem = new int_zero_out_if_buffer<Torus>(
          streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -52,7 +55,10 @@ template <typename Torus> struct int_mul_memory {
      return;
    }

+    auto glwe_dimension = params.glwe_dimension;
+    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
+    auto carry_modulus = params.carry_modulus;

    // 'vector_result_lsb' contains blocks from all possible shifts of
    // radix_lwe_left excluding zero ciphertext blocks
@@ -96,6 +102,18 @@ template <typename Torus> struct int_mul_memory {
      return (x * y) / message_modulus;
    };

+    // generate accumulators
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), lsb_acc,
+        luts_array->get_degree(0), luts_array->get_max_degree(0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        lut_f_lsb, gpu_memory_allocated);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0), msb_acc,
+        luts_array->get_degree(1), luts_array->get_max_degree(1),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        lut_f_msb, gpu_memory_allocated);
+
    // lut_indexes_vec for luts_array should be reinitialized
    // first lsb_vector_block_count value should reference to lsb_acc
    // last msb_vector_block_count values should reference to msb_acc
@@ -105,12 +123,9 @@ template <typename Torus> struct int_mul_memory {
          streams.stream(0), streams.gpu_index(0),
          luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
          msb_vector_block_count);
-
    auto active_streams =
        streams.active_gpu_subset(total_block_count, params.pbs_type);
-    luts_array->generate_and_broadcast_bivariate_lut(
-        active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, gpu_memory_allocated);
-
+    luts_array->broadcast_lut(active_streams);
    // create memory object for sum ciphertexts
    sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
        streams, params, num_radix_blocks, 2 * num_radix_blocks,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
@@ -22,8 +22,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
    uint32_t calculated_active_blocks =
        total_random_bits == 0
            ? 0
-            : (total_random_bits + message_bits_per_block - 1) /
-                  message_bits_per_block;
+            : CEIL_DIV(total_random_bits, message_bits_per_block);
    if (num_blocks_to_process != calculated_active_blocks) {
      PANIC(
          "num_blocks_to_process should be equal to calculated_active_blocks");
@@ -170,8 +169,7 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
    this->allocate_gpu_memory = allocate_gpu_memory;

    this->num_random_input_blocks =
-        (num_input_random_bits + message_bits_per_block - 1) /
-        message_bits_per_block;
+        CEIL_DIV(num_input_random_bits, message_bits_per_block);

    this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
        streams, params, this->num_random_input_blocks, message_bits_per_block,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/scalar_shifts.h
@@ -85,11 +85,15 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
-
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
+          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          shift_lut_f, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
+      cur_lut_bivariate->broadcast_lut(active_streams);

      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
@@ -168,10 +172,16 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
      }

      // right shift
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
+          cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          shift_lut_f, gpu_memory_allocated);
      auto active_streams =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
+      cur_lut_bivariate->broadcast_lut(active_streams);
+
      lut_buffers_bivariate.push_back(cur_lut_bivariate);
    }
  }
@@ -261,11 +271,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return shifted | padding;
      };

+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          shift_last_block_lut_univariate->get_lut(0, 0),
+          shift_last_block_lut_univariate->get_degree(0),
+          shift_last_block_lut_univariate->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
      auto active_streams_shift_last =
          streams.active_gpu_subset(1, params.pbs_type);
-      shift_last_block_lut_univariate->generate_and_broadcast_lut(
-          active_streams_shift_last, {0}, {last_block_lut_f},
-          gpu_memory_allocated);
+      shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);

      lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
    }
@@ -283,8 +298,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
      return (params.message_modulus - 1) * x_sign_bit;
    };

-    padding_block_lut_univariate->generate_and_broadcast_lut(
-        active_streams, {0}, {padding_block_lut_f}, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        padding_block_lut_univariate->get_lut(0, 0),
+        padding_block_lut_univariate->get_degree(0),
+        padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        padding_block_lut_f, gpu_memory_allocated);
+    // auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
+    padding_block_lut_univariate->broadcast_lut(active_streams);

    lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -317,11 +339,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        return message_of_current_block + carry_of_previous_block;
      };

+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0),
+          shift_blocks_lut_bivariate->get_lut(0, 0),
+          shift_blocks_lut_bivariate->get_degree(0),
+          shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          blocks_lut_f, gpu_memory_allocated);
      auto active_streams_shift_blocks =
          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-      shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
-          active_streams_shift_blocks, {0}, {blocks_lut_f},
-          gpu_memory_allocated);
+      shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);

      lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
    }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/shift_and_rotate.h
@@ -113,21 +113,27 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
      else
        return current_bit;
    };
-    ;
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
+        mux_lut->get_degree(0), mux_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, mux_lut_f, gpu_memory_allocated);
    auto active_gpu_count_mux = streams.active_gpu_subset(
        bits_per_block * num_radix_blocks, params.pbs_type);
-
-    mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
-                                        gpu_memory_allocated);
+    mux_lut->broadcast_lut(active_gpu_count_mux);

    auto cleaning_lut_f = [params](Torus x) -> Torus {
      return x % params.message_modulus;
    };
-
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
+        cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
    auto active_gpu_count_cleaning =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    cleaning_lut->generate_and_broadcast_lut(
-        active_gpu_count_cleaning, {0}, {cleaning_lut_f}, gpu_memory_allocated);
+    cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/subtraction.h
@@ -74,26 +74,45 @@ template <typename Torus> struct int_overflowing_sub_memory {
                                           luts_array, size_tracker,
                                           allocate_gpu_memory, size_tracker);

+    auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
+    auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
+
+    // generate luts (aka accumulators)
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
+        luts_array->get_degree(0), luts_array->get_max_degree(0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        f_lut_does_block_generate_carry, gpu_memory_allocated);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        lut_does_block_generate_or_propagate, luts_array->get_degree(1),
+        luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
+        gpu_memory_allocated);
    if (allocate_gpu_memory)
      cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                  luts_array->get_lut_indexes(0, 1), 1,
                                  num_radix_blocks - 1);

+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        luts_borrow_propagation_sum->get_lut(0, 0),
+        luts_borrow_propagation_sum->get_degree(0),
+        luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        f_luts_borrow_propagation_sum, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
+        message_acc->get_degree(0), message_acc->get_max_degree(0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        f_message_acc, gpu_memory_allocated);
+
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
-    luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
-        active_streams, {0}, {f_luts_borrow_propagation_sum},
-        gpu_memory_allocated);
-
-    luts_array->generate_and_broadcast_lut(
-        active_streams, {0, 1},
-        {f_lut_does_block_generate_carry,
-         f_lut_does_block_generate_or_propagate},
-        gpu_memory_allocated);
-    // generate luts (aka accumulators)
-
-    message_acc->generate_and_broadcast_lut(
-        active_streams, {0}, {f_message_acc}, gpu_memory_allocated);
+    luts_array->broadcast_lut(active_streams);
+    luts_borrow_propagation_sum->broadcast_lut(active_streams);
+    message_acc->broadcast_lut(active_streams);
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
@@ -7,7 +7,8 @@
 #include <functional>
 #include <vector>

-const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
+// If we use more than 5 streams the result is incorrect
+const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 5;

 template <typename Torus> struct int_equality_selectors_buffer {
  int_radix_params params;
@@ -175,8 +176,7 @@ template <typename Torus> struct int_possible_results_buffer {
    this->lut_stride =
        (ciphertext_modulus / this->max_luts_per_call) * box_size;

-    this->num_lut_accumulators =
-        (total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
+    this->num_lut_accumulators = CEIL_DIV(total_luts_needed, max_luts_per_call);

    stream_luts =
        new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
@@ -298,10 +298,14 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
      int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
          streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

-      lut->generate_and_broadcast_lut(
-          streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
-          allocate_gpu_memory);
+      generate_device_accumulator<Torus>(
+          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
+          lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
+          params.polynomial_size, params.message_modulus, params.carry_modulus,
+          id_fn, allocate_gpu_memory);

+      lut->broadcast_lut(
+          streams.active_gpu_subset(num_blocks, params.pbs_type));
      this->stream_identity_luts[i] = lut;
    }

@@ -314,17 +318,27 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {

    this->message_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-
-    this->message_extract_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
-        allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->message_extract_lut->get_lut(0, 0),
+        this->message_extract_lut->get_degree(0),
+        this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        msg_fn, allocate_gpu_memory);
+    this->message_extract_lut->broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type));

    this->carry_extract_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
-
-    this->carry_extract_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
-        allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->carry_extract_lut->get_lut(0, 0),
+        this->carry_extract_lut->get_degree(0),
+        this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        carry_fn, allocate_gpu_memory);
+    this->carry_extract_lut->broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type));

    this->partial_aggregated_vectors =
        new CudaRadixCiphertextFFI *[num_streams];
@@ -1171,9 +1185,15 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {prefix_sum_fn}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->prefix_sum_lut->get_lut(0, 0),
+        this->prefix_sum_lut->get_degree(0),
+        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        prefix_sum_fn, allocate_gpu_memory);
+    this->prefix_sum_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1183,9 +1203,14 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    this->cleanup_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {cleanup_fn}, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
+        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        cleanup_fn, allocate_gpu_memory);
+    this->cleanup_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
  }

  void release(CudaStreams streams) {
@@ -1351,9 +1376,15 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    this->prefix_sum_lut = new int_radix_lut<Torus>(
        streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);

-    this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {prefix_sum_fn}, allocate_gpu_memory);
+    generate_device_accumulator_bivariate<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->prefix_sum_lut->get_lut(0, 0),
+        this->prefix_sum_lut->get_degree(0),
+        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        prefix_sum_fn, allocate_gpu_memory);
+    this->prefix_sum_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));

    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;
@@ -1363,9 +1394,14 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
    };
    this->cleanup_lut = new int_radix_lut<Torus>(
        streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
-    this->cleanup_lut->generate_and_broadcast_lut(
-        streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
-        {cleanup_fn}, allocate_gpu_memory);
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
+        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        cleanup_fn, allocate_gpu_memory);
+    this->cleanup_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
  }

  void release(CudaStreams streams) {
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
@@ -73,9 +73,10 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
                                                int8_t **fp_ks_buffer,
                                                bool gpu_memory_allocated);

-void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
-                                   void const *input, void *output,
-                                   uint32_t base_log, uint32_t level_count);
+void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
+                                         void const *input, void *output,
+                                         uint32_t base_log,
+                                         uint32_t level_count);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium.h
@@ -1,24 +0,0 @@
-#ifndef TRIVIUM_H
-#define TRIVIUM_H
-
-#include "../integer/integer.h"
-
-extern "C" {
-uint64_t scratch_cuda_trivium_64(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
-
-void cuda_trivium_generate_keystream_64(
-    CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
-    const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
-    uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
-
-void cleanup_cuda_trivium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/trivium/trivium_utilities.h
@@ -1,295 +0,0 @@
-#ifndef TRIVIUM_UTILITIES_H
-#define TRIVIUM_UTILITIES_H
-#include "../integer/integer_utilities.h"
-
-/// Struct to hold the LUTs.
-template <typename Torus> struct int_trivium_lut_buffers {
-  // Bivariate AND Gate LUT:
-  // AND operation: f(a, b) = (a & 1) & (b & 1).
-  // This is a Bivariate PBS used for the non-linear parts of Trivium.
-  int_radix_lut<Torus> *and_lut;
-
-  // Univariate Identity LUT:
-  // MESSAGE EXTRACTION operation: f(x) = x & 1.
-  // This is a Univariate PBS used to "flush" the state: it resets the noise
-  // after additions and ensures the message stays within the binary message
-  // space.
-  int_radix_lut<Torus> *flush_lut;
-
-  int_trivium_lut_buffers(CudaStreams streams, const int_radix_params &params,
-                          bool allocate_gpu_memory, uint32_t num_trivium_inputs,
-                          uint64_t &size_tracker) {
-
-    constexpr uint32_t BATCH_SIZE = 64;
-    constexpr uint32_t MAX_AND_PER_STEP = 3;
-    uint32_t total_lut_ops = num_trivium_inputs * BATCH_SIZE * MAX_AND_PER_STEP;
-
-    this->and_lut = new int_radix_lut<Torus>(streams, params, 1, total_lut_ops,
-                                             allocate_gpu_memory, size_tracker);
-
-    std::function<Torus(Torus, Torus)> and_lambda =
-        [](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };
-
-    auto active_streams_and =
-        streams.active_gpu_subset(total_lut_ops, params.pbs_type);
-    this->and_lut->generate_and_broadcast_bivariate_lut(
-        active_streams_and, {0}, {and_lambda}, allocate_gpu_memory);
-    this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
-
-    uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
-
-    this->flush_lut = new int_radix_lut<Torus>(
-        streams, params, 1, total_flush_ops, allocate_gpu_memory, size_tracker);
-
-    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
-      return x & 1;
-    };
-
-    auto active_streams_flush =
-        streams.active_gpu_subset(total_flush_ops, params.pbs_type);
-    this->flush_lut->generate_and_broadcast_lut(
-        active_streams_flush, {0}, {flush_lambda}, allocate_gpu_memory);
-    this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
-  }
-
-  void release(CudaStreams streams) {
-    this->and_lut->release(streams);
-    delete this->and_lut;
-    this->and_lut = nullptr;
-
-    this->flush_lut->release(streams);
-    delete this->flush_lut;
-    this->flush_lut = nullptr;
-  }
-};
-
-/// Struct to hold the state and temporary workspaces required for
-/// Trivium execution on the GPU.
-///
-/// This struct manages the memory for the internal registers (A, B, C),
-/// temporary buffers used during the update function, and buffers used for
-/// packing data before and after PBS.
-template <typename Torus> struct int_trivium_state_workspaces {
-  // Trivium Internal State Registers:
-  // Register A: 93 bits
-  CudaRadixCiphertextFFI *a_reg;
-  // Register B: 84 bits
-  CudaRadixCiphertextFFI *b_reg;
-  // Register C: 111 bits
-  CudaRadixCiphertextFFI *c_reg;
-
-  // Shift Workspace:
-  // Used to manage bitshifting operations on the registers
-  CudaRadixCiphertextFFI *shift_workspace;
-
-  // Temporary Update Buffers:
-  // Intermediate buffers for the trivium update logic (t1, t2, t3)
-  CudaRadixCiphertextFFI *temp_t1;
-  CudaRadixCiphertextFFI *temp_t2;
-  CudaRadixCiphertextFFI *temp_t3;
-
-  // Buffers to hold the new values for the registers after an update step
-  CudaRadixCiphertextFFI *new_a;
-  CudaRadixCiphertextFFI *new_b;
-  CudaRadixCiphertextFFI *new_c;
-
-  // PBS Packing Buffers:
-  // Buffers for packing inputs into the bivariate lookup table (AND gate)
-  CudaRadixCiphertextFFI *packed_pbs_lhs;
-  CudaRadixCiphertextFFI *packed_pbs_rhs;
-  // Buffer for the output of the bivariate PBS
-  CudaRadixCiphertextFFI *packed_pbs_out;
-
-  // Flush/Cleanup Packing Buffers:
-  // Buffers for the "flush" LUT which cleans up noise after additions
-  CudaRadixCiphertextFFI *packed_flush_in;
-  CudaRadixCiphertextFFI *packed_flush_out;
-
-  int_trivium_state_workspaces(CudaStreams streams,
-                               const int_radix_params &params,
-                               bool allocate_gpu_memory, uint32_t num_inputs,
-                               uint64_t &size_tracker) {
-
-    this->a_reg = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->a_reg, 93 * num_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->b_reg = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->b_reg, 84 * num_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->c_reg = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->c_reg, 111 * num_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->shift_workspace = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->shift_workspace,
-        128 * num_inputs, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    uint32_t batch_blocks = 64 * num_inputs;
-
-    this->temp_t1 = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->temp_t1, batch_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->temp_t2 = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->temp_t2, batch_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->temp_t3 = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->temp_t3, batch_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->new_a = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->new_a, batch_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->new_b = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->new_b, batch_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->new_c = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->new_c, batch_blocks,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->packed_pbs_lhs = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_pbs_lhs,
-        3 * batch_blocks, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->packed_pbs_rhs = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_pbs_rhs,
-        3 * batch_blocks, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->packed_pbs_out = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_pbs_out,
-        3 * batch_blocks, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->packed_flush_in = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_flush_in,
-        4 * batch_blocks, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->packed_flush_out = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->packed_flush_out,
-        4 * batch_blocks, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-  }
-
-  void release(CudaStreams streams, bool allocate_gpu_memory) {
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->a_reg, allocate_gpu_memory);
-    delete this->a_reg;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->b_reg, allocate_gpu_memory);
-    delete this->b_reg;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->c_reg, allocate_gpu_memory);
-    delete this->c_reg;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->shift_workspace, allocate_gpu_memory);
-    delete this->shift_workspace;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->temp_t1, allocate_gpu_memory);
-    delete this->temp_t1;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->temp_t2, allocate_gpu_memory);
-    delete this->temp_t2;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->temp_t3, allocate_gpu_memory);
-    delete this->temp_t3;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->new_a, allocate_gpu_memory);
-    delete this->new_a;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->new_b, allocate_gpu_memory);
-    delete this->new_b;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->new_c, allocate_gpu_memory);
-    delete this->new_c;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_pbs_lhs, allocate_gpu_memory);
-    delete this->packed_pbs_lhs;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_pbs_rhs, allocate_gpu_memory);
-    delete this->packed_pbs_rhs;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_pbs_out, allocate_gpu_memory);
-    delete this->packed_pbs_out;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_flush_in, allocate_gpu_memory);
-    delete this->packed_flush_in;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->packed_flush_out, allocate_gpu_memory);
-    delete this->packed_flush_out;
-  }
-};
-
-template <typename Torus> struct int_trivium_buffer {
-  int_radix_params params;
-  bool allocate_gpu_memory;
-  uint32_t num_inputs;
-
-  int_trivium_lut_buffers<Torus> *luts;
-  int_trivium_state_workspaces<Torus> *state;
-
-  int_trivium_buffer(CudaStreams streams, const int_radix_params &params,
-                     bool allocate_gpu_memory, uint32_t num_inputs,
-                     uint64_t &size_tracker) {
-    this->params = params;
-    this->allocate_gpu_memory = allocate_gpu_memory;
-    this->num_inputs = num_inputs;
-
-    this->luts = new int_trivium_lut_buffers<Torus>(
-        streams, params, allocate_gpu_memory, num_inputs, size_tracker);
-
-    this->state = new int_trivium_state_workspaces<Torus>(
-        streams, params, allocate_gpu_memory, num_inputs, size_tracker);
-  }
-
-  void release(CudaStreams streams) {
-    luts->release(streams);
-    delete luts;
-    luts = nullptr;
-
-    state->release(streams, allocate_gpu_memory);
-    delete state;
-    state = nullptr;
-
-    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
-  }
-};
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -14,10 +14,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
    uint32_t casting_output_dimension, uint32_t casting_ks_level,
    uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
-    const bool *is_boolean_array, uint32_t num_compact_lists,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    KS_TYPE casting_key_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    const bool *is_boolean_array, const uint32_t is_boolean_array_len,
+    uint32_t num_compact_lists, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_expand_without_verification_64(
    CudaStreamsFFI streams, void *lwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -118,7 +118,8 @@ template <typename Torus> struct zk_expand_mem {
  zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
-                const bool *is_boolean_array, uint32_t num_compact_lists,
+                const bool *is_boolean_array,
+                const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
                bool allocate_gpu_memory, uint64_t &size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
@@ -174,6 +175,40 @@ template <typename Torus> struct zk_expand_mem {
    message_and_carry_extract_luts = new int_radix_lut<Torus>(
        streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);

+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_and_carry_extract_luts->get_lut(0, 0),
+        message_and_carry_extract_luts->get_degree(0),
+        message_and_carry_extract_luts->get_max_degree(0),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_and_carry_extract_luts->get_lut(0, 1),
+        message_and_carry_extract_luts->get_degree(1),
+        message_and_carry_extract_luts->get_max_degree(1),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_and_carry_extract_luts->get_lut(0, 2),
+        message_and_carry_extract_luts->get_degree(2),
+        message_and_carry_extract_luts->get_max_degree(2),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, message_extract_and_sanitize_bool_lut_f,
+        gpu_memory_allocated);
+
+    generate_device_accumulator<Torus>(
+        streams.stream(0), streams.gpu_index(0),
+        message_and_carry_extract_luts->get_lut(0, 3),
+        message_and_carry_extract_luts->get_degree(3),
+        message_and_carry_extract_luts->get_max_degree(3),
+        params.glwe_dimension, params.polynomial_size, params.message_modulus,
+        params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
+        gpu_memory_allocated);
+
    // We are always packing two LWEs. We just need to be sure we have enough
    // space in the carry part to store a message of the same size as is in the
    // message part.
@@ -236,14 +271,36 @@ template <typename Torus> struct zk_expand_mem {
      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
        auto lwe_index = i + num_packed_msgs * offset;
        auto lwe_index_in_list = i % num_lwes_in_kth;
+        PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       lwe_index, num_packed_msgs * num_lwes);
        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
        h_indexes_out[lwe_index] =
            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
-        // If the input relates to a boolean, shift the LUT so the correct one
-        // with sanitization is used
+        PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       h_indexes_in[lwe_index], num_packed_msgs * num_lwes);
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       h_indexes_out[lwe_index], num_packed_msgs * num_lwes);
+        // is_boolean_array tells us which input is a boolean and thus the
+        // related output needs boolean sanitization. It naturally has
+        // total_blocks entries, but h_indexes_out reaches
+        // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
+        // the ceiling causes out-of-bounds access. Reading garbage "true" would
+        // set h_lut_indexes to an invalid index pointing to uninitialized
+        // memory instead of a real LUT. Rust pads is_boolean_array with FALSE
+        // to match.
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
+                       "Cuda error: index %d for is_boolean_array is out of "
+                       "bounds (len is %d)",
+                       h_indexes_out[lwe_index], is_boolean_array_len);
        auto boolean_offset =
            is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
        h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+        PANIC_IF_FALSE(
+            h_lut_indexes[lwe_index] < 4,
+            "Cuda error: lut index is greater than the max possible value (3)");
      }
      offset += num_lwes_in_kth;
    }
@@ -258,13 +315,7 @@ template <typename Torus> struct zk_expand_mem {

    auto active_streams =
        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
-
-    message_and_carry_extract_luts->generate_and_broadcast_lut(
-        active_streams, {0, 1, 2, 3},
-        {message_extract_lut_f, carry_extract_lut_f,
-         message_extract_and_sanitize_bool_lut_f,
-         carry_extract_and_sanitize_bool_lut_f},
-        gpu_memory_allocated);
+    message_and_carry_extract_luts->broadcast_lut(active_streams);

    message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
        active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -183,9 +183,10 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_128(
      base_log, level_count, num_lwes);
 }

-void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
-                                   void const *input, void *output,
-                                   uint32_t base_log, uint32_t level_count) {
+void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
+                                         void const *input, void *output,
+                                         uint32_t base_log,
+                                         uint32_t level_count) {
  host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
                                  static_cast<const uint64_t *>(input),
                                  static_cast<uint64_t *>(output), base_log,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -10,7 +10,6 @@
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
 #include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

@@ -351,6 +350,7 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
      Torus state =
          init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
      uint32_t offset = i * level_count * (lwe_dimension_out + 1);
+#pragma unroll 1
      for (int j = 0; j < level_count; j++) {

        KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
@@ -363,16 +363,15 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
    lwe_acc_out[shmem_index] = local_lwe_out;
  }

-  if (tid <= lwe_dimension_out) {
-    for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
-         offset /= 2) {
-      __syncthreads();
+  for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
+    __syncthreads();
+    if (tid <= lwe_dimension_out && threadIdx.y < offset) {
      lwe_acc_out[shmem_index] +=
          lwe_acc_out[shmem_index + offset * blockDim.x];
    }
-    if (threadIdx.y == 0)
-      block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
  }
+  if (tid <= lwe_dimension_out && threadIdx.y == 0)
+    block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
 }

 template <typename Torus, typename KSTorus>
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/packing_keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/packing_keyswitch.cuh
@@ -12,12 +12,9 @@
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
 #include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

-#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
-
 // Finish the keyswitching operation and prepare GLWEs for accumulation.
 // 1. Finish the keyswitching computation partially performed with a GEMM:
 //  - negate the dot product between the GLWE and KSK polynomial
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -6,7 +6,7 @@
 #include "helper_multi_gpu.h"
 #include "polynomial/parameters.cuh"
 #include "types/int128.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <limits>

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -1,4 +1,5 @@
 #include "device.h"
+#include "utils/helper.cuh"
 #include <cstdint>
 #include <cuda_runtime.h>
 #include <mutex>
@@ -6,6 +7,27 @@
 #include <cuda_profiler_api.h>
 #endif

+/// Checks that `ptr` is a non-null device pointer that lives on `gpu_index`.
+/// The pointer is looked up with cudaPointerGetAttributes; the check fails
+/// (through GPU_ASSERT / PANIC) if the pointer is null, if the reported device
+/// differs from `gpu_index`, or if the reported memory type is not
+/// cudaMemoryTypeDevice.
+void validate_device_ptr_and_gpu_index(const void *ptr, uint32_t gpu_index) {
+  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
+
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
+  // Either mismatch on its own is enough to reject the pointer
+  if (attr.device != gpu_index || attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer.")
+  }
+}
+
+/// Checks that `ptr` is a non-null device pointer, without constraining which
+/// GPU it belongs to. The check fails (through GPU_ASSERT / PANIC) if the
+/// pointer is null or if cudaPointerGetAttributes does not report
+/// cudaMemoryTypeDevice for it.
+/// Returns the device index reported by cudaPointerGetAttributes.
+int validate_device_ptr(const void *ptr) {
+  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
+
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
+  if (attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer.")
+  }
+  return attr.device;
+}
+
 uint32_t cuda_get_device() {
  int device;
  check_cuda_error(cudaGetDevice(&device));
@@ -30,8 +52,9 @@ bool mem_pools_enabled = false;
 // better results.
 void cuda_setup_mempool(uint32_t caller_gpu_index) {
  if (!mem_pools_enabled) {
-    pool_mutex.lock();
-    if (mem_pools_enabled)
+    std::lock_guard lock(pool_mutex);
+    if (mem_pools_enabled) // double-check - mem_pools_enabled might have been
+                           // changed in a different thread
      return; // If mem pools are already enabled, we don't need to do anything

    // We do it only once for all GPUs
@@ -78,7 +101,6 @@ void cuda_setup_mempool(uint32_t caller_gpu_index) {
    }
    // We return to the original gpu_index
    cuda_set_device(caller_gpu_index);
-    pool_mutex.unlock();
  }
 }

@@ -234,60 +256,61 @@ bool cuda_check_support_thread_block_clusters() {
 #endif
 }

-/// Copy memory to the GPU asynchronously
+/// Copy memory from the CPU to a GPU with size tracking.
+/// This copy is asynchronous only if the CPU memory was pinned, i.e.
+/// allocated using cudaMallocHost. This was shown to come with a performance
+/// penalty if we allocate all CPU data in this way in the backend, so
+/// cudaMallocHost is only used in specific places where we need an
+/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
+/// example to copy the bootstrapping key).
+/// The copy only happens if gpu_memory_allocated is true.
 void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
                                                  uint64_t size,
                                                  cudaStream_t stream,
                                                  uint32_t gpu_index,
                                                  bool gpu_memory_allocated) {
+
+  // src is the CPU-side buffer of this host-to-device copy, so a null src is
+  // a null host pointer. This check runs before the early return below.
+  GPU_ASSERT(src != nullptr, "Cuda error: null host ptr");
+
   if (size == 0 || !gpu_memory_allocated)
     return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
-  }
+  // dest must be a device pointer located on gpu_index
+  validate_device_ptr_and_gpu_index(dest, gpu_index);
 
   cuda_set_device(gpu_index);
   check_cuda_error(
       cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }

-/// Copy memory to the GPU asynchronously
+/// Copy memory from the CPU to a GPU.
+/// This copy is asynchronous only if the CPU memory was pinned, i.e.
+/// allocated using cudaMallocHost. This was shown to come with a performance
+/// penalty if we allocate all CPU data in this way in the backend, so
+/// cudaMallocHost is only used in specific places where we need an
+/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
+/// example to copy the bootstrapping key).
 void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
  cuda_memcpy_with_size_tracking_async_to_gpu(dest, src, size, stream,
                                              gpu_index, true);
 }

-/// Copy memory within a GPU asynchronously
+/// Copy memory within a GPU asynchronously.
+/// The copy only happens if gpu_memory_allocated is true
 void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
     void *dest, void const *src, uint64_t size, cudaStream_t stream,
     uint32_t gpu_index, bool gpu_memory_allocated) {
   if (size == 0 || !gpu_memory_allocated)
     return;
-  GPU_ASSERT(dest != nullptr,
-             "Cuda error: trying to copy gpu->gpu to null ptr");
-  GPU_ASSERT(src != nullptr,
-             "Cuda error: trying to copy gpu->gpu from null ptr");
 
-  cudaPointerAttributes attr_dest;
-  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  PANIC_IF_FALSE(
-      attr_dest.type == cudaMemoryTypeDevice,
-      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
-  cudaPointerAttributes attr_src;
-  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  PANIC_IF_FALSE(
-      attr_src.type == cudaMemoryTypeDevice,
-      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
+  // validate_device_ptr rejects null and non-device pointers and returns the
+  // index of the device each buffer lives on
+  int src_gpu_index = validate_device_ptr(src);
+  int dest_gpu_index = validate_device_ptr(dest);
   cuda_set_device(gpu_index);
+  // Same device: plain device-to-device copy. Different devices: peer copy
+  // between the two devices reported for dest and src.
-  if (attr_src.device == attr_dest.device) {
+  if (src_gpu_index == dest_gpu_index) {
     check_cuda_error(
         cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
   } else {
-    check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
-                                         attr_src.device, size, stream));
+    check_cuda_error(cudaMemcpyPeerAsync(dest, dest_gpu_index, src,
+                                         src_gpu_index, size, stream));
   }
 }
 void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
@@ -327,21 +350,20 @@ void cuda_synchronize_device(uint32_t gpu_index) {
  check_cuda_error(cudaDeviceSynchronize());
 }

+/// cuda_memset sets bytes, we basically only use it to initialize data to 0
+/// The memset only happens if gpu_memory_allocated is true
 void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                           uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
                                           bool gpu_memory_allocated) {
   if (size == 0 || !gpu_memory_allocated)
     return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
-  }
+  // dest must be a non-null device pointer located on gpu_index
+  validate_device_ptr_and_gpu_index(dest, gpu_index);
   cuda_set_device(gpu_index);
   check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }

+/// cuda_memset sets bytes, we basically only use it to initialize data to 0
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                       cudaStream_t stream, uint32_t gpu_index) {
  cuda_memset_with_size_tracking_async(dest, val, size, stream, gpu_index,
@@ -366,7 +388,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
    }
    cuda_set_device(gpu_index);
    int block_size = 256;
-    int num_blocks = (n + block_size - 1) / block_size;
+    int num_blocks = CEIL_DIV(n, block_size);

    // Launch the kernel
    cuda_set_value_kernel<Torus>
@@ -384,15 +406,15 @@ template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                                   uint32_t n);

 /// Copy memory to the CPU asynchronously
+/// This comes with a big penalty on performance even if the CPU
+/// memory is pinned (using cudaMallocHost for the CPU allocation),
+/// so it should be avoided at all costs
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
+  GPU_ASSERT(dest != nullptr, "Cuda error: null host ptr");
  if (size == 0)
    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, src));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
-  }
+  validate_device_ptr_and_gpu_index(src, gpu_index);

  cuda_set_device(gpu_index);
  check_cuda_error(
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
@@ -68,9 +68,15 @@ struct alignas(16) f128 {
    auto t = two_sum(a.lo, b.lo);

    double hi = s.hi;
+#ifdef __CUDA_ARCH__
+    double lo = __dadd_rn(s.lo, t.hi);
+    hi = __dadd_rn(hi, lo);
+    lo = __dsub_rn(lo, __dsub_rn(hi, s.hi));
+#else
    double lo = s.lo + t.hi;
    hi = hi + lo;
    lo = lo - (hi - s.hi);
+#endif

    return f128(hi, lo + t.lo);
  }
@@ -104,8 +110,13 @@ struct alignas(16) f128 {
  __host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
    auto s = two_diff(a.hi, b.hi);
    auto t = two_diff(a.lo, b.lo);
+#ifdef __CUDA_ARCH__
+    s = quick_two_sum(s.hi, __dadd_rn(s.lo, t.hi));
+    return quick_two_sum(s.hi, __dadd_rn(s.lo, t.lo));
+#else
    s = quick_two_sum(s.hi, s.lo + t.hi);
    return quick_two_sum(s.hi, s.lo + t.lo);
+#endif
  }

  // Multiplication
@@ -220,16 +231,16 @@ struct f128x2 {
  // Subtraction
  __host__ __device__ friend f128x2 operator-(const f128x2 &a,
                                              const f128x2 &b) {
-    return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
-                  f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
+    return f128x2(f128::sub_estimate(a.re, b.re),
+                  f128::sub_estimate(a.im, b.im));
  }

  // Multiplication (complex multiplication)
  __host__ __device__ friend f128x2 operator*(const f128x2 &a,
                                              const f128x2 &b) {
+    const f128 a_im_b_im = f128::mul(a.im, b.im);
    f128 real_part =
-        f128::add(f128::mul(a.re, b.re),
-                  f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
+        f128::add(f128::mul(a.re, b.re), f128(-a_im_b_im.hi, -a_im_b_im.lo));
    f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
    return f128x2(real_part, imag_part);
  }
@@ -243,8 +254,8 @@ struct f128x2 {

  // Subtraction-assignment operator
  __host__ __device__ f128x2 &operator-=(const f128x2 &other) {
-    re = f128::add(re, f128(-other.re.hi, -other.re.lo));
-    im = f128::add(im, f128(-other.im.hi, -other.im.lo));
+    re = f128::sub_estimate(re, other.re);
+    im = f128::sub_estimate(im, other.im);
    return *this;
  }

@@ -261,12 +272,20 @@ struct f128x2 {
 };

 __host__ __device__ inline uint64_t double_to_bits(double d) {
+#ifdef __CUDA_ARCH__
+  uint64_t bits = __double_as_longlong(d);
+#else
  uint64_t bits = *reinterpret_cast<uint64_t *>(&d);
+#endif
  return bits;
 }

 __host__ __device__ inline double bits_to_double(uint64_t bits) {
+#ifdef __CUDA_ARCH__
+  double d = __longlong_as_double(bits);
+#else
  double d = *reinterpret_cast<double *>(&bits);
+#endif
  return d;
 }

@@ -275,6 +294,8 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
  const double A = ONE << 52;
  const double B = ONE << 104;
  const double C = ONE << 76;
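+  // NOTE: __longlong_as_double(0x37f0000000000000ULL) cannot be used here:
+  // 0x37f0000000000000 encodes 2^-128, whereas D must be 2^128
+  // (0x47f0000000000000), and the intrinsic is only available in device code
+  // while this function is also compiled for the host.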
  const double D = 340282366920938500000000000000000000000.;

  const __uint128_t threshold = (ONE << 104);
@@ -288,15 +309,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = A_bits | lower64;
    double l_temp = bits_to_double(bits_l);
-    double l = l_temp - A;

    uint64_t B_bits = double_to_bits(B);
    uint64_t top64 = static_cast<uint64_t>(x >> 52);
    uint64_t bits_h = B_bits | top64;
    double h_temp = bits_to_double(bits_h);
+
+#ifdef __CUDA_ARCH__
+    return __dadd_rn(__dsub_rn(l_temp, A), __dsub_rn(h_temp, B));
+#else
+    double l = l_temp - A;
    double h = h_temp - B;

    return (l + h);
+#endif

  } else {
    uint64_t C_bits = double_to_bits(C);
@@ -310,15 +336,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = C_bits | lower64 | mask_part;
    double l_temp = bits_to_double(bits_l);
-    double l = l_temp - C;

    uint64_t D_bits = double_to_bits(D);
    uint64_t top64 = static_cast<uint64_t>(x >> 76);
    uint64_t bits_h = D_bits | top64;
    double h_temp = bits_to_double(bits_h);
+
+#ifdef __CUDA_ARCH__
+    return __dadd_rn(__dsub_rn(l_temp, C), __dsub_rn(h_temp, D));
+#else
+    double l = l_temp - C;
    double h = h_temp - D;

    return (l + h);
+#endif
  }
 }

@@ -389,6 +420,8 @@ __host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {

 __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  auto x = f128::sub_estimate(a, f128::f128_floor(a));
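+  // NOTE: __longlong_as_double(0x37f0000000000000ULL) cannot be used here:
+  // 0x37f0000000000000 encodes 2^-128, whereas normalization must be 2^128
+  // (0x47f0000000000000), and the intrinsic is only available in device code
+  // while this function is also compiled for the host.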
  const double normalization = 340282366920938500000000000000000000000.;
 #ifdef __CUDA_ARCH__
  x.hi = __dmul_rn(x.hi, normalization);
@@ -398,7 +431,7 @@ __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  x.lo *= normalization;
 #endif

-  // TODO has to be round
+  x = f128::add_estimate(x, f128(0.5, 0.0));
  x = f128::f128_floor(x);

  __uint128_t x0 = f64_to_u128(x.hi);
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -12,8 +12,9 @@
 using Index = unsigned;

 #define NEG_TWID(i)                                                            \
-  f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]),               \
-         f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
+  f128x2(                                                                      \
+      f128(__ldg(&neg_twiddles_re_hi[(i)]), __ldg(&neg_twiddles_re_lo[(i)])),  \
+      f128(__ldg(&neg_twiddles_im_hi[(i)]), __ldg(&neg_twiddles_im_lo[(i)])))

 #define F64x4_TO_F128x2(f128x2_reg, ind)                                       \
  f128x2_reg.re.hi = dt_re_hi[ind];                                            \
@@ -75,7 +76,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
-      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+      if (u_stays_in_register) {
+        F128x2_TO_F64x4(v[i], tid);
+      } else {
+        F128x2_TO_F64x4(u[i], tid);
+      }
      tid = tid + STRIDE;
    }
    __syncthreads();
@@ -86,8 +91,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);
-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
+      if (u_stays_in_register) {
+        v[i] = w;
+      } else {
+        u[i] = w;
+      }
      w = NEG_TWID(tid / lane_mask + twiddle_shift);
      f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
      f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
@@ -151,7 +159,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      // keep one of the register for next iteration and store another one in sm
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
-      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+      if (u_stays_in_register) {
+        F128x2_TO_F64x4(v[i], tid);
+      } else {
+        F128x2_TO_F64x4(u[i], tid);
+      }

      tid = tid + STRIDE;
    }
@@ -165,8 +177,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);

-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
+      if (u_stays_in_register) {
+        v[i] = w;
+      } else {
+        u[i] = w;
+      }

      tid = tid + STRIDE;
    }
@@ -218,7 +233,7 @@ __device__ void convert_u128_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re, const __uint128_t *in_im) {

-  const double normalization = pow(2., -128.);
+  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
@@ -241,7 +256,7 @@ __device__ void convert_u128_on_regs_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {

-  const double normalization = pow(2., -128.);
+  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -12,7 +12,7 @@
 #include "integer/subtraction.cuh"
 #include "pbs/programmable_bootstrap_classic.cuh"
 #include "pbs/programmable_bootstrap_multibit.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"

 // lwe_dimension + 1 threads
 // todo: This kernel MUST be refactored to a binary reduction
@@ -98,7 +98,7 @@ __host__ void are_all_comparisons_block_true(

  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    int num_chunks = CEIL_DIV(remaining_blocks, max_value);

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
@@ -222,7 +222,7 @@ __host__ void is_at_least_one_comparisons_block_true(
  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    int num_chunks = CEIL_DIV(remaining_blocks, max_value);

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -10,7 +10,122 @@
 #include "integer/integer.cuh"
 #include "linearalgebra/multiplication.cuh"
 #include "polynomial/functions.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
+
+/*
+ * =============================================================================
+ * GPU Compression/Decompression Algorithm: Overview
+ * =============================================================================
+ *
+ * The compression algorithm transforms standard LWE ciphertexts into a compact
+ * packed format. Decompression reverses this process.
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION INPUT (lwe_array_in)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |                    lwe_array_in (GPU memory)                            |
+ *  +-------------------------------------------------------------------------+
+ *  +---------------------------+---------------------------+-----------------+
+ *  |          LWE 0            |          LWE 1            |      ...        |
+ *  |      [mask, body]         |      [mask, body]         |                 |
+ *  +---------------------------+---------------------------+-----------------+
+ *  |<-- lwe_dimension + 1 -->|
+ *
+ *  Total LWEs: total_lwe_bodies_count (num_radix_blocks)
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION PROCESS
+ * -----------------------------------------------------------------------------
+ *
+ * 1. Message Shift (64-bit only):
+ *    Each LWE is multiplied by message_modulus to shift the message to MSB
+ *
+ * 2. Packing Keyswitch (LWE -> GLWE):
+ *    Groups of up to lwe_per_glwe LWEs are packed into a single GLWE:
+ *
+ *    +--------------------------------------------------------------+
+ *    |   lwe_per_glwe LWEs (input batch)                            |
+ *    |   LWE[0], LWE[1], ..., LWE[lwe_per_glwe-1]                   |
+ *    +--------------------------------------------------------------+
+ *                              |
+ *                    Packing Keyswitch
+ *                              v
+ *    +--------------------------------------------------------------+
+ *    |            Single GLWE Ciphertext                            |
+ *    |   [A_0, A_1, ..., A_{k-1}, B]                                |
+ *    |   |<-- k * polynomial_size -->| |<-- polynomial_size -->|   |
+ *    +--------------------------------------------------------------+
+ *
+ *    Number of output GLWEs: num_glwes = ceil(total_lwe_bodies_count /
+ *                                             lwe_per_glwe)
+ *
+ * 3. Modulus Switch:
+ *    Reduce precision from 64-bit torus to storage_log_modulus bits
+ *
+ * 4. Bit Packing:
+ *    Pack multiple reduced-precision elements into dense bit representation
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION MEMORY LAYOUT (tmp_glwe_array_out)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |                 tmp_glwe_array_out (intermediate buffer)                |
+ *  +-------------------------------------------------------------------------+
+ *  +----------------------------+----------------------------+---------------+
+ *  |         GLWE 0             |         GLWE 1             |    ...        |
+ *  |  [A_0..A_{k-1}, B_0..B_N]  |  [A_0..A_{k-1}, B_0..B_N]  |               |
+ *  +----------------------------+----------------------------+---------------+
+ *       |<-- glwe_accumulator_size = (k+1)*N -->|
+ *
+ *  Total size needed: num_glwes * glwe_accumulator_size elements
+ *  Where: num_glwes = ceil(total_lwe_bodies_count / lwe_per_glwe)
+ *
+ * -----------------------------------------------------------------------------
+ * PACKED OUTPUT (glwe_array_out)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |              Packed GLWE Ciphertext List (bit-packed)                   |
+ *  +-------------------------------------------------------------------------+
+ *  +-------------------------------------------------------------------------+
+ *  |  Elements packed with storage_log_modulus bits per original element    |
+ *  |  Total packed size: ceil(in_len * storage_log_modulus / 64) elements   |
+ *  +-------------------------------------------------------------------------+
+ *
+ * =============================================================================
+ * DECOMPRESSION (Extract) Algorithm
+ * =============================================================================
+ *
+ * Decompression receives an array of LWE indexes. For each index, it identifies
+ * the corresponding GLWE, extracts that GLWE from the packed representation,
+ * and then sample-extracts the requested LWE from the GLWE.
+ *
+ * -----------------------------------------------------------------------------
+ * EXTRACT OUTPUT LAYOUT (glwe_array_out in host_extract)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |               Extracted GLWE Ciphertext                                 |
+ *  +-------------------------------------------------------------------------+
+ *  +---------------------------------------+-----------------+---------------+
+ *  |    Mask (A polynomials)               |   Body (B)      |    Tail       |
+ *  |    [A_0, ..., A_{k-1}]                |   (body_count)  |   (zeroed)    |
+ *  |    k * polynomial_size elements       |   elements      |   elements    |
+ *  +---------------------------------------+-----------------+---------------+
+ *  |<------------------- initial_out_len ------------------->|               |
+ *  |<------------------------ glwe_ciphertext_size ------------------------->|
+ *
+ *  For the last GLWE, body_count may be less than polynomial_size (partial).
+ *  The tail region must be zeroed to ensure defined behavior.
+ *
+ *  tail_size = glwe_ciphertext_size - initial_out_len
+ *  tail_offset = initial_out_len  (NOT 0!)
+ *
+ * =============================================================================
+ */

 template <typename Torus>
 __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
@@ -66,7 +181,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,

  // number_bits_to_pack.div_ceil(Scalar::BITS)
  auto nbits = sizeof(Torus) * 8;
-  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
+  auto out_len = CEIL_DIV(number_bits_to_pack, nbits);

  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(out_len, 1024, num_blocks, num_threads);
@@ -108,6 +223,8 @@ host_integer_compress(CudaStreams streams,
  uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
                        glwe_array_out->lwe_per_glwe - 1) /
                       glwe_array_out->lwe_per_glwe;
+  PANIC_IF_FALSE(num_glwes <= mem_ptr->max_num_glwes,
+                 "Invalid number of GLWEs");

  // Keyswitch LWEs to GLWE
  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -200,8 +317,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,

  auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;

-  uint32_t num_glwes =
-      (total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
+  uint32_t num_glwes = CEIL_DIV(total_lwe_bodies_count, polynomial_size);

  // Compressed length of the compressed GLWE we want to extract
  uint32_t body_count = 0;
@@ -218,19 +334,21 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,

  uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;

-  // Calculates how many bits this particular GLWE shall use
-  auto number_bits_to_unpack = initial_out_len * log_modulus;
  auto nbits = sizeof(Torus) * 8;

-  // Calculates how many bits a full-packed GLWE shall use
-  number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
-  auto len = (number_bits_to_unpack + nbits - 1) / nbits;
+  // Calculate how many bits a full-packed GLWE uses, to determine
+  // the stride between consecutive packed GLWEs in the input buffer
+  auto number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
+  auto len = CEIL_DIV(number_bits_to_unpack, nbits);
  // Uses that length to set the input pointer
  auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;

  // Ensure the tail of the GLWE is zeroed
+  // The extract kernel writes initial_out_len elements starting at offset 0.
+  // We must zero the tail region (from initial_out_len to
+  // glwe_ciphertext_size)
  if (initial_out_len < glwe_ciphertext_size) {
-    cuda_memset_async(glwe_array_out, 0,
+    cuda_memset_async(glwe_array_out + initial_out_len, 0,
                      (glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
                      stream, gpu_index);
  }
@@ -368,7 +486,7 @@ host_integer_decompress(CudaStreams streams,
      /// gather data to GPU 0 we can copy back to the original indexing
      multi_gpu_scatter_lwe_async<Torus>(
          active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in,
-          lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
+          lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
          lut->active_streams.count(), num_blocks_to_decompress,
          compression_params.small_lwe_dimension + 1);

@@ -388,7 +506,7 @@ host_integer_decompress(CudaStreams streams,
      multi_gpu_gather_lwe_async<Torus>(
          active_streams, (Torus *)d_lwe_array_out->ptr, lwe_after_pbs_vec,
          lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-          lut->lwe_aligned_vec, num_blocks_to_decompress,
+          lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_decompress,
          encryption_params.big_lwe_dimension + 1);

      /// Synchronize all GPUs
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -14,7 +14,6 @@
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
 #include "utils/helper_profile.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <algorithm>
 #include <functional>

@@ -273,8 +272,7 @@ __global__ void device_radix_split_simulators_and_grouping_pgns(
      }
    }

-    if ((blockIdx.x / group_size + 1) <
-        (blocks_count + group_size - 1) / group_size) {
+    if ((blockIdx.x / group_size + 1) < CEIL_DIV(blocks_count, group_size)) {
      size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size;
      size_t pgns_offset = (blockIdx.x / group_size) * lwe_size;
      for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
@@ -363,7 +361,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
      num_radix_blocks > src1->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks should have more "
          "blocks than the number used in sum in groups")
-  auto num_groups = (num_radix_blocks + group_size - 1) / group_size;
+  auto num_groups = CEIL_DIV(num_radix_blocks, group_size);
  if (src2->num_radix_blocks < num_groups)
    PANIC("Cuda error: second input in sum in groups should have at least "
          "num_groups blocks")
@@ -570,8 +568,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
-        big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
+        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(
@@ -594,7 +592,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)lwe_array_out->ptr, lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
+        big_lwe_dimension + 1);
    POP_RANGE()
    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
        active_streams);
@@ -674,8 +673,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
-        big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
+        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(
@@ -791,8 +790,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_pbs_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
-        big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
+        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(
@@ -815,7 +814,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)(lwe_array_out->ptr), lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
+        big_lwe_dimension + 1);
    POP_RANGE()
    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
        active_streams);
@@ -1067,85 +1067,6 @@ void generate_device_accumulator_bivariate(
  POP_RANGE()
 }

-template <typename Torus> struct int_lut_cache {
-  int_lut_cache() {}
-
-  Torus *get_cached_univariate_lut(std::function<Torus(Torus)> &f, uint64_t *degree,
-                        uint64_t *max_degree, uint32_t glwe_dimension,
-                        uint32_t polynomial_size,
-                        uint32_t input_message_modulus,
-                        uint32_t input_carry_modulus,
-                        uint32_t output_message_modulus,
-                        uint32_t output_carry_modulus) {
-    /*__int128_t f_hash = 0;
-    uint32_t bits_per_lut_val = 5;
-    uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
-    for (uint32_t i = 0; i < input_modulus_sup; ++i) {
-      Torus f_eval = f(i);
-      GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
-                 "LUT value expected bitwidth overflow");
-      f_hash |= f_eval;
-      f_hash <<= bits_per_lut_val;
-    }
-
-    std::lock_guard cache_lock(_mutex);
-    if (_lut_cache.find(f_hash) != _lut_cache.end()) {
-      lut_ptr &ptr = _lut_cache[f_hash];
-      GPU_ASSERT(ptr.output_message_modulus == output_message_modulus,
-                 "Error modulus");
-      GPU_ASSERT(ptr.input_message_modulus == input_message_modulus,
-                 "Error modulus");
-      GPU_ASSERT(ptr.glwe_dimension == glwe_dimension, "Error modulus");
-      *max_degree = ptr.max_degree;
-      *degree = ptr.degree;
-      return ptr.ptr;
-    }*/
-
-    // host lut
-    Torus *h_lut =
-        (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
-
-    *max_degree = input_message_modulus * input_carry_modulus - 1;
-    *degree = generate_lookup_table_with_encoding<Torus>(
-        h_lut, glwe_dimension, polynomial_size, input_message_modulus,
-        input_carry_modulus, output_message_modulus, output_carry_modulus, f);
-
-    /*lut_ptr new_ptr = {h_lut,
-                       glwe_dimension,
-                       input_message_modulus,
-                       input_carry_modulus,
-                       output_message_modulus,
-                       output_carry_modulus,
-                       *max_degree,
-                       *degree};*/
-    //_lut_cache[f_hash] = new_ptr;
-    return h_lut;
-  }
-
-  ~int_lut_cache() {
-    std::lock_guard cache_lock(_mutex);
-    for (auto v : _lut_cache) {
-      free(v.second.ptr);
-    }
-    _lut_cache.clear();
-  }
-
-private:
-  struct lut_ptr {
-    Torus *ptr;
-    uint32_t glwe_dimension;
-    uint32_t input_message_modulus;
-    uint32_t input_carry_modulus;
-    uint32_t output_message_modulus;
-    uint32_t output_carry_modulus;
-    uint64_t max_degree;
-    uint64_t degree;
-  };
-  std::map<__int128_t, lut_ptr> _lut_cache;
-  std::mutex _mutex;
-};
-static int_lut_cache<uint64_t> g_LutCache64;
-
 /*
 *  generate bivariate accumulator with factor scaling for device pointer
 *    v_stream - cuda stream
@@ -1177,8 +1098,8 @@ void generate_device_accumulator_bivariate_with_factor(
      (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
      gpu_memory_allocated);

-//  cuda_synchronize_stream(stream, gpu_index);
-//  free(h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
 /*
 *  generate bivariate accumulator for device pointer
@@ -1224,36 +1145,23 @@ void generate_device_accumulator_with_encoding(
    uint32_t output_message_modulus, uint32_t output_carry_modulus,
    std::function<Torus(Torus)> f, bool gpu_memory_allocated) {

-  static constexpr auto is_u64 = std::is_same_v<Torus, uint64_t>;
-  Torus *h_lut = nullptr;
  // host lut
-  if constexpr (is_u64) {
-    h_lut = g_LutCache64.get_cached_univariate_lut(
-        f, degree, max_degree, glwe_dimension, polynomial_size,
-        input_message_modulus, input_carry_modulus, output_message_modulus,
-        output_carry_modulus);
-  } else {
-    h_lut =
-        (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
+  Torus *h_lut =
+      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
+
+  *max_degree = input_message_modulus * input_carry_modulus - 1;
+  // fill accumulator
+  *degree = generate_lookup_table_with_encoding<Torus>(
+      h_lut, glwe_dimension, polynomial_size, input_message_modulus,
+      input_carry_modulus, output_message_modulus, output_carry_modulus, f);

-    *max_degree = input_message_modulus * input_carry_modulus - 1;
-    // fill accumulator
-    *degree = generate_lookup_table_with_encoding<Torus>(
-        h_lut, glwe_dimension, polynomial_size, input_message_modulus,
-        input_carry_modulus, output_message_modulus, output_carry_modulus, f);
-  }
-/*
  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_with_size_tracking_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index, gpu_memory_allocated);
-*/
-  if (!std::is_same_v<Torus, uint64_t>) {
-    cuda_synchronize_stream(stream, gpu_index);
-    free(h_lut);
-  }
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }
-
 template <typename Torus>
 void generate_device_accumulator_with_encoding_with_cpu_prealloc(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
@@ -1356,8 +1264,8 @@ void generate_many_lut_device_accumulator(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index, gpu_memory_allocated);

-  //cuda_synchronize_stream(stream, gpu_index);
-  //free(h_lut);
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
  POP_RANGE()
 }

@@ -2434,7 +2342,7 @@ integer_radix_apply_noise_squashing(CudaStreams streams,

  // Since the radix ciphertexts are packed, we have to use the num_radix_blocks
  // from the output ct
-  auto active_streams = streams.active_gpu_subset(
+  auto active_streams = streams.active_gpu_subset_u128(
      lwe_array_out->num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<InputTorus>(
@@ -2465,8 +2373,9 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
    multi_gpu_scatter_lwe_async<InputTorus>(
        active_streams, lwe_array_in_vec, (InputTorus *)lwe_array_pbs_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_scatter_vec, lut->active_streams.count(),
-        lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
+        lut->lwe_aligned_scatter_vec, lut->event_pool,
+        lut->active_streams.count(), lwe_array_out->num_radix_blocks,
+        lut->input_big_lwe_dimension + 1);

    execute_keyswitch_async<InputTorus>(
        active_streams, lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -2489,7 +2398,8 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
    multi_gpu_gather_lwe_async<__uint128_t>(
        active_streams, (__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
        nullptr, lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
-        lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
+        lut->event_pool, lwe_array_out->num_radix_blocks,
+        big_lwe_dimension + 1);

    /// Synchronize all GPUs
    streams.synchronize();
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -362,7 +362,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
  radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
                                num_radix_in_vec, chunk_size, needs_processing);
  int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
-  int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
+  int part_count = CEIL_DIV(big_lwe_size, number_of_threads);
  const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);

  mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -10,7 +10,7 @@
 #include "device.h"
 #include "integer/integer.cuh"
 #include "integer/integer_utilities.h"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <iostream>
 #include <sstream>
 #include <string>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
@@ -54,7 +54,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in,
-        lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
+        lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
        active_streams.count(), num_blocks_to_process,
        mem_ptr->params.small_lwe_dimension + 1);
    POP_RANGE()
@@ -72,7 +72,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)radix_lwe_out->ptr, lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, num_blocks_to_process,
+        lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_process,
        mem_ptr->params.big_lwe_dimension + 1);
    POP_RANGE()
    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -4,8 +4,8 @@
 #include "device.h"
 #include "integer/integer.h"
 #include "integer/radix_ciphertext.h"
+#include "utils/helper.cuh"
 #include "utils/helper_profile.cuh"
-#include "utils/kernel_dimensions.cuh"

 inline CudaLweCiphertextListFFI
 to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cuh
@@ -5,8 +5,8 @@
 #include "integer/radix_ciphertext.h"
 #include "integer/rerand.h"
 #include "integer/rerand_utilities.h"
+#include "utils/helper.cuh"
 #include "utils/helper_profile.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include "zk/zk_utilities.h"

 template <typename Torus, class params>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -8,7 +8,7 @@

 #include "device.h"
 #include "helper_multi_gpu.h"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <stdio.h>

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -12,7 +12,7 @@
 #include "integer/scalar_mul.h"
 #include "multiplication.cuh"
 #include "scalar_shifts.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <stdio.h>

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
@@ -116,7 +116,7 @@ __host__ void host_integer_overflowing_sub(
      radix_params.message_modulus * radix_params.carry_modulus;
  uint32_t num_bits_in_block = log2_int(block_modulus);
  uint32_t grouping_size = num_bits_in_block;
-  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
+  uint32_t num_groups = CEIL_DIV(num_blocks, grouping_size);

  host_unchecked_sub_with_correcting_term<Torus>(
      streams.stream(0), streams.gpu_index(0), output, input_left, input_right,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh
@@ -175,14 +175,17 @@ __host__ void host_aggregate_one_hot_vector(
    Torus *const *ksks) {

  int_radix_params params = mem_ptr->params;
+  if (params.message_modulus > 4 && params.carry_modulus > 4) {
+    PANIC("Cuda error: aggregate one hot vector is only implemented for 1_1 "
+          "and 2_2 params");
+  }
  uint32_t chunk_size = mem_ptr->chunk_size;
  uint32_t num_streams = mem_ptr->num_streams;

  mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
      streams);

-  uint32_t inputs_per_stream =
-      (num_input_ciphertexts + num_streams - 1) / num_streams;
+  uint32_t inputs_per_stream = CEIL_DIV(num_input_ciphertexts, num_streams);

  for (uint32_t s = 0; s < num_streams; s++) {

@@ -210,7 +213,7 @@ __host__ void host_aggregate_one_hot_vector(
    if (count_in_stream == 0)
      continue;

-    uint32_t num_chunks = (count_in_stream + chunk_size - 1) / chunk_size;
+    uint32_t num_chunks = CEIL_DIV(count_in_stream, chunk_size);

    //
    // Process chunks of input ciphertexts for the current stream
@@ -255,7 +258,10 @@ __host__ void host_aggregate_one_hot_vector(

  //
  // Aggregate partial results from all streams into the final aggregated vector
+  // num_streams has to be less than the max noise level, otherwise we
+  // accumulate too many partial results and the noise limit is exceeded
  //
+  CHECK_NOISE_LEVEL(num_streams, params.message_modulus, params.carry_modulus);
  for (uint32_t s = 1; s < num_streams; s++) {
    uint32_t start_idx = s * inputs_per_stream;
    if (start_idx >= num_input_ciphertexts)
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -8,7 +8,7 @@
 #include "helper_multi_gpu.h"
 #include "integer/integer.h"
 #include "integer/integer_utilities.h"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <stdio.h>

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -6,7 +6,7 @@
 #include <cuda_runtime.h>
 #endif

-#include "../utils/kernel_dimensions.cuh"
+#include "../utils/helper.cuh"
 #include "device.h"
 #include "linear_algebra.h"
 #include <fstream>
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
@@ -6,7 +6,7 @@
 #include <cuda_runtime.h>
 #endif

-#include "../utils/kernel_dimensions.cuh"
+#include "../utils/helper.cuh"
 #include "device.h"
 #include "linear_algebra.h"

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -211,6 +211,8 @@ __global__ void device_programmable_bootstrap_amortized(
  // For the mask it's more complicated
  sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
+
+  // No need to sync here, it is already synchronized after add_to_torus
  sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
 }
@@ -303,7 +305,9 @@ __host__ void host_programmable_bootstrap_amortized(
    int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count) {
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap amortized only supports 64-bit "
+                 "Torus type.");
  uint64_t SM_FULL =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -173,6 +173,7 @@ __global__ void device_programmable_bootstrap_cg(
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -184,7 +185,8 @@ __global__ void device_programmable_bootstrap_cg(
              &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];
-
+          // No need to sync, it is already synchronized before the first
+          // sample_extract_body call
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -247,7 +249,9 @@ __host__ void host_programmable_bootstrap_cg(
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    uint32_t num_many_lut, uint32_t lut_stride) {
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap cg only supports 64-bit "
+                 "Torus type.");
  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
  uint64_t full_sm =
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -151,9 +151,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }

      } else if (blockIdx.y == glwe_dimension) {
-
+        __syncthreads();
        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-
        if (num_many_lut > 1) {
          for (int i = 1; i < num_many_lut; i++) {

@@ -165,6 +164,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];

+            // No need to sync, it is already synchronized before the first
+            // sample_extract_body call
            sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                               accumulator, 0, i * lut_stride);
          }
@@ -302,7 +303,10 @@ __host__ void execute_cg_external_product_loop(
    uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
    uint32_t lut_stride) {
  cuda_set_device(gpu_index);
-
+  PANIC_IF_FALSE(
+      sizeof(Torus) == 8,
+      "Error: Programmable bootstrap multi-bit cg only supports 64-bit "
+      "Torus type.");
  uint64_t full_sm =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -239,6 +239,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -251,6 +252,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

+          // No need to sync, it is already synchronized before the first
+          // sample_extract_body call
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -409,7 +412,9 @@ __host__ void execute_step_one(
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap step one only supports 64-bit "
+                 "Torus type.");
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
@@ -451,7 +456,9 @@ __host__ void execute_step_two(
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
    uint32_t num_many_lut, uint32_t lut_stride) {
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap step two only supports 64-bit "
+                 "Torus type.");
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
@@ -44,30 +44,31 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(

  switch (polynomial_size) {
  case 256:
-    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
+    host_programmable_bootstrap_128<InputTorus, Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
-    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
+    host_programmable_bootstrap_128<InputTorus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
-    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<1024>>(
+    host_programmable_bootstrap_128<InputTorus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
-    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<2048>>(
+    host_programmable_bootstrap_128<InputTorus, Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
@@ -91,30 +92,31 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(

  switch (polynomial_size) {
  case 256:
-    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<256>>(
+    host_programmable_bootstrap_cg_128<InputTorus, Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
-    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<512>>(
+    host_programmable_bootstrap_cg_128<InputTorus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
-    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<1024>>(
+    host_programmable_bootstrap_cg_128<InputTorus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
-    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<2048>>(
+    host_programmable_bootstrap_cg_128<InputTorus, Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
@@ -240,10 +240,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
    }
  } else {
-    // Persist the updated accumulator
+    // Persist the updated accumulator. We don't sync here because
+    // `accumulator` is read with the same indexes that `add_to_torus_128`
+    // used to write into it.
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      global_slice[tid] = accumulator[tid];
@@ -395,6 +398,7 @@ __global__ void device_programmable_bootstrap_cg_128(
                                               accumulator);

    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
                                               0);
    }
@@ -571,38 +575,35 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
          input_lwe_ciphertext_count, max_shared_memory)) {
    switch (polynomial_size) {
    case 256:
-      return scratch_programmable_bootstrap_cg_128<InputTorus,
-                                                   AmortizedDegree<256>>(
+      return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<256>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 512:
-      return scratch_programmable_bootstrap_cg_128<InputTorus,
-                                                   AmortizedDegree<512>>(
+      return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<512>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 1024:
-      return scratch_programmable_bootstrap_cg_128<InputTorus,
-                                                   AmortizedDegree<1024>>(
+      return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<1024>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 2048:
-      return scratch_programmable_bootstrap_cg_128<InputTorus,
-                                                   AmortizedDegree<2048>>(
+      return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<2048>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 4096:
+      // We use AmortizedDegree for 4096 to avoid register exhaustion
      return scratch_programmable_bootstrap_cg_128<InputTorus,
                                                   AmortizedDegree<4096>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
@@ -618,38 +619,35 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
  } else {
    switch (polynomial_size) {
    case 256:
-      return scratch_programmable_bootstrap_128<InputTorus,
-                                                AmortizedDegree<256>>(
+      return scratch_programmable_bootstrap_128<InputTorus, Degree<256>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 512:
-      return scratch_programmable_bootstrap_128<InputTorus,
-                                                AmortizedDegree<512>>(
+      return scratch_programmable_bootstrap_128<InputTorus, Degree<512>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 1024:
-      return scratch_programmable_bootstrap_128<InputTorus,
-                                                AmortizedDegree<1024>>(
+      return scratch_programmable_bootstrap_128<InputTorus, Degree<1024>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 2048:
-      return scratch_programmable_bootstrap_128<InputTorus,
-                                                AmortizedDegree<2048>>(
+      return scratch_programmable_bootstrap_128<InputTorus, Degree<2048>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 4096:
+      // We use AmortizedDegree for 4096 to avoid register exhaustion
      return scratch_programmable_bootstrap_128<InputTorus,
                                                AmortizedDegree<4096>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
@@ -956,22 +954,19 @@ __host__ bool supports_cooperative_groups_on_programmable_bootstrap_128(
    uint32_t max_shared_memory) {
  switch (polynomial_size) {
  case 256:
-    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
-        AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
-                              max_shared_memory);
+    return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<256>>(
+        glwe_dimension, level_count, num_samples, max_shared_memory);
  case 512:
-    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
-        AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
-                              max_shared_memory);
+    return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<512>>(
+        glwe_dimension, level_count, num_samples, max_shared_memory);
  case 1024:
-    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
-        AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
-                               max_shared_memory);
+    return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<1024>>(
+        glwe_dimension, level_count, num_samples, max_shared_memory);
  case 2048:
-    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
-        AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
-                               max_shared_memory);
+    return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<2048>>(
+        glwe_dimension, level_count, num_samples, max_shared_memory);
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
        AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
                               max_shared_memory);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -461,6 +461,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      // No need to sync here, it is already synchronized after add_to_torus
      sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -473,6 +474,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

+          // No need to sync here, it is already synchronized after add_to_torus
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             global_slice, 0, i * lut_stride);
        }
@@ -663,7 +665,9 @@ __host__ void execute_compute_keybundle(
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
  cuda_set_device(gpu_index);
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: PBS keybundle only supports 64-bit "
+                 "Torus type.");
  auto lwe_chunk_size = buffer->lwe_chunk_size;
  uint64_t chunk_size = std::min(
      lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
@@ -735,7 +739,10 @@ __host__ void execute_step_one(
    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
  cuda_set_device(gpu_index);
-
+  PANIC_IF_FALSE(
+      sizeof(Torus) == 8,
+      "Error: Programmable bootstrap multi-bit step one only supports 64-bit "
+      "Torus type.");
  uint64_t full_sm_accumulate_step_one =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
          polynomial_size);
@@ -789,7 +796,10 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
                 uint32_t level_count, uint32_t j, uint32_t num_many_lut,
                 uint32_t lut_stride) {
  cuda_set_device(gpu_index);
-
+  PANIC_IF_FALSE(
+      sizeof(Torus) == 8,
+      "Error: Programmable bootstrap multi-bit step two only supports 64-bit "
+      "Torus type.");
  uint32_t lwe_chunk_size = (uint32_t)(buffer->lwe_chunk_size);
  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
@@ -11,29 +11,30 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
  switch (polynomial_size) {
  case 256:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<256>>(
+                                                        Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 512:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<512>>(
+                                                        Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 1024:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<1024>>(
+                                                        Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 2048:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<2048>>(
+                                                        Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
@@ -55,30 +56,31 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(

  switch (polynomial_size) {
  case 256:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<256>>(
+    return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
+                                                           Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 512:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<512>>(
+    return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
+                                                           Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 1024:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<1024>>(
+    return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
+                                                           Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 2048:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<2048>>(
+    return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
+                                                           Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
@@ -97,7 +99,7 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  bool supports_cg =
-      supports_cooperative_groups_on_multibit_programmable_bootstrap<
+      supports_cooperative_groups_on_multibit_programmable_bootstrap_128<
          __uint128_t>(glwe_dimension, polynomial_size, level_count,
                       input_lwe_ciphertext_count,
                       cuda_get_max_shared_memory(gpu_index));
@@ -129,7 +131,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(

  switch (polynomial_size) {
  case 256:
-    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
+    host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -137,7 +139,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
+    host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -145,8 +147,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_multi_bit_programmable_bootstrap_128<InputTorus,
-                                              AmortizedDegree<1024>>(
+    host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -154,8 +155,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 2048:
-    host_multi_bit_programmable_bootstrap_128<InputTorus,
-                                              AmortizedDegree<2048>>(
+    host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -163,6 +163,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    host_multi_bit_programmable_bootstrap_128<InputTorus,
                                              AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
@@ -191,8 +192,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(

  switch (polynomial_size) {
  case 256:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<256>>(
+    host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -200,8 +200,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<512>>(
+    host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -209,8 +208,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<1024>>(
+    host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -218,8 +216,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 2048:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<2048>>(
+    host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -227,6 +224,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
@@ -333,6 +333,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<__uint128_t, params>(block_lwe_array_out,
                                               global_slice, 0);
      if (num_many_lut > 1) {
@@ -346,6 +347,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

+          // No need to sync, it is already synchronized before the first
+          // sample_extract_body call
          sample_extract_body<__uint128_t, params>(
              next_block_lwe_array_out, global_slice, 0, i * lut_stride);
        }
@@ -505,10 +508,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }

      } else if (blockIdx.y == glwe_dimension) {
-
+        __syncthreads();
        sample_extract_body<__uint128_t, params>(block_lwe_array_out,
                                                 accumulator, 0);
-
        if (num_many_lut > 1) {
          for (int i = 1; i < num_many_lut; i++) {

@@ -519,7 +521,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];
-
+            // No need to sync, it is already synchronized before the first
+            // sample_extract_body call
            sample_extract_body<__uint128_t, params>(
                next_block_lwe_array_out, accumulator, 0, i * lut_stride);
          }
@@ -1090,4 +1093,109 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap_128(
  return size_tracker;
 }

+// Verify that the grid of the cooperative-groups multi-bit PBS-128 accumulate
+// kernel fits on the device. Returns false when cooperative groups are not
+// supported or the grid exceeds (active blocks per SM) * (number of SMs).
+template <typename Torus, class params>
+__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128(
+    int glwe_dimension, int level_count, int num_samples,
+    uint32_t max_shared_memory) {
+  // If Cooperative Groups is not supported, no need to check anything else
+  if (!cuda_check_support_cooperative_groups())
+    return false;
+
+  // Shared memory requested for the FULLSM and PARTIALSM kernel variants
+  uint64_t full_sm_cg_accumulate =
+      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
+          params::degree);
+  uint64_t partial_sm_cg_accumulate =
+      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
+          params::degree);
+
+  int thds = params::degree / params::opt;
+  // Grid size; occupancy result starts at 0 so it is never read uninitialized
+  int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
+  int max_active_blocks_per_sm = 0;
+
+  if (max_shared_memory < partial_sm_cg_accumulate) {
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_per_sm,
+        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
+            Torus, params, NOSM>,
+        thds, 0);
+  } else if (max_shared_memory < full_sm_cg_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
+                                                                  PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
+                                                                  PARTIALSM>,
+        cudaFuncCachePreferShared);
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_per_sm,
+        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
+            Torus, params, PARTIALSM>,
+        thds, partial_sm_cg_accumulate);
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
+                                                                  FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
+                                                                  FULLSM>,
+        cudaFuncCachePreferShared);
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_per_sm,
+        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
+            Torus, params, FULLSM>,
+        thds, full_sm_cg_accumulate);
+  }
+  // Surface any CUDA error raised above, for every variant (NOSM included)
+  check_cuda_error(cudaGetLastError());
+  // SM count. NOTE(review): queries device 0; confirm it is the GPU in use
+  int number_of_sm = 0;
+  check_cuda_error(
+      cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0));
+  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
+}
+
+// Runtime dispatch on polynomial_size: reports whether the cooperative-groups
+// multi-bit PBS-128 kernel grid passes the grid-size verification for the
+// matching compile-time degree. Panics on an unsupported polynomial size.
+template <typename Torus>
+__host__ bool
+supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
+    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
+    uint32_t max_shared_memory) {
+  if (polynomial_size == 256) {
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, Degree<256>>(glwe_dimension, level_count, num_samples,
+                            max_shared_memory);
+  } else if (polynomial_size == 512) {
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, Degree<512>>(glwe_dimension, level_count, num_samples,
+                            max_shared_memory);
+  } else if (polynomial_size == 1024) {
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, Degree<1024>>(glwe_dimension, level_count, num_samples,
+                             max_shared_memory);
+  } else if (polynomial_size == 2048) {
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, Degree<2048>>(glwe_dimension, level_count, num_samples,
+                             max_shared_memory);
+  } else if (polynomial_size == 4096) {
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
+                                      max_shared_memory);
+  } else {
+    PANIC(
+        "Cuda error (multi-bit PBS128): unsupported polynomial size. Supported "
+        "N's are powers of two"
+        " in the interval [256..4096].")
+  }
+}
+
 #endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -179,6 +179,7 @@ __global__ void device_programmable_bootstrap_tbc(
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);

      if (num_many_lut > 1) {
@@ -191,13 +192,17 @@ __global__ void device_programmable_bootstrap_tbc(
              &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];
-
+          // No need to sync, it is already synchronized before the first
+          // sample_extract_body call
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
      }
    }
  }
+  // Before exiting the kernel we need to sync the cluster to ensure that
+  // other blocks can still access the dsm in the mul ggsw glwe
+  cluster.sync();
 }

 template <typename Torus, class params, sharedMemDegree SMD>
@@ -207,8 +212,8 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
    const Torus *__restrict__ lut_vector_indexes,
    const Torus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
-    uint32_t lwe_dimension, uint32_t num_many_lut, uint32_t lut_stride,
+    const double2 *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
+    uint32_t num_many_lut, uint32_t lut_stride,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  constexpr uint32_t level_count = 1;
@@ -254,9 +259,6 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
      &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
                  (glwe_dimension + 1)];

-  double2 *block_join_buffer =
-      &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
-                   params::degree / 2];
  // Since the space is L1 cache is small, we use the same memory location for
  // the rotated accumulator and the fft accumulator, since we know that the
  // rotated array is not in use anymore by the time we perform the fft
@@ -279,13 +281,14 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
  Torus temp_a_hat = 0;
  for (int i = 0; i < lwe_dimension; i++) {

+    constexpr int WARP_SIZE = 32;
    // We calculate the modulus switch of a warp size of elements
-    if (i % 32 == 0 && (i + threadIdx.x % 32) < lwe_dimension) {
-      modulus_switch(block_lwe_array_in[i + threadIdx.x % 32], temp_a_hat,
-                     log_modulus);
+    if (i % WARP_SIZE == 0 && (i + threadIdx.x % WARP_SIZE) < lwe_dimension) {
+      modulus_switch(block_lwe_array_in[i + threadIdx.x % WARP_SIZE],
+                     temp_a_hat, log_modulus);
    }
    // each iteration we broadcast the corresponding ms previously calculated
-    Torus a_hat = __shfl_sync(0xFFFFFFFF, temp_a_hat, i % 32);
+    Torus a_hat = __shfl_sync(0xFFFFFFFF, temp_a_hat, i % WARP_SIZE);

    __syncthreads();
    Torus reg_acc_rotated[params::opt];
@@ -360,6 +363,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      // No need to sync here, it is already synchronized after add_to_torus
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);

      if (num_many_lut > 1) {
@@ -373,12 +377,16 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

+          // No need to sync here, it is already synchronized after add_to_torus
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
      }
    }
  }
+  // Before exiting the kernel we need to sync the cluster to ensure that
+  // other blocks can still access the dsm in the mul ggsw glwe or the
+  // ping pong buffers
  cluster.sync();
 }

@@ -458,6 +466,9 @@ __host__ void host_programmable_bootstrap_tbc(
    uint32_t num_many_lut, uint32_t lut_stride) {
  cuda_set_device(gpu_index);

+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap tbc only supports 64-bit "
+                 "Torus type.");
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  auto supports_dsm =
      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
@@ -543,8 +554,8 @@ __host__ void host_programmable_bootstrap_tbc(
          &config,
          device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
          lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-          lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
-          lwe_dimension, num_many_lut, lut_stride, noise_reduction_type));
+          lwe_array_in, lwe_input_indexes, bootstrapping_key, lwe_dimension,
+          num_many_lut, lut_stride, noise_reduction_type));
    } else {
      config.dynamicSmemBytes = full_sm + minimum_sm_tbc;

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -156,6 +156,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
          }
        }
      } else if (blockIdx.y == glwe_dimension) {
+        __syncthreads();
        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
        if (num_many_lut > 1) {
          for (int i = 1; i < num_many_lut; i++) {
@@ -167,7 +168,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];
-
+            // No need to sync, it is already synchronized before the first
+            // sample_extract_body call
            sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                               accumulator, 0, i * lut_stride);
          }
@@ -179,6 +181,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
          accumulator, global_accumulator_slice);
    }
  }
+  // Before exiting the kernel we need to sync the cluster to ensure that
+  // other blocks can still access the dsm in the mul ggsw glwe
+  cluster.sync();
 }

 // Specialized version for the multi-bit bootstrap using 2_2 params:
@@ -356,6 +361,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      // No need to sync here, it is already synchronized after add_to_torus
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -367,7 +373,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
              &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];
-
+          // No need to sync here, it is already synchronized after
+          // add_to_torus
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -379,7 +386,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        reg_acc_rotated, global_accumulator_slice);
  }
  // Before exiting the kernel we need to sync the cluster to ensure that
-  //  that other blocks can still access the dsm in the ping pong buffer
+  // other blocks can still access the dsm in the ping pong buffer
  cluster.sync();
 }

@@ -523,6 +530,11 @@ __host__ void execute_tbc_external_product_loop(
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
    uint32_t lut_stride) {
+
+  PANIC_IF_FALSE(
+      sizeof(Torus) == 8,
+      "Error: Programmable bootstrap multi-bit tbc only supports 64-bit "
+      "Torus type.");
  cuda_set_device(gpu_index);

  auto lwe_chunk_size = buffer->lwe_chunk_size;
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/dot_product.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/dot_product.cuh
@@ -3,8 +3,6 @@

 #include "crypto/torus.cuh"

-#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
-
 #define CIRCULANT_BLOCKTILE 32
 // Make a circulant matrix that serves to multiply a polynomial
 // with another one. Each thread loads a part of the original
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
David Testé	c8b19c3f10	chore(ci): update slab-github-runner action to v1.5.0 This new version improves handling of asynchronous tasks related to Slab and GitHub API.	2026-02-11 13:59:49 +01:00
Agnes Leroy	7408ca1126	chore(gpu): add fallback for 4-l40 in CI	2026-02-11 13:59:49 +01:00
Arthur Meyre	8712746296	chore: bump TFHE-rs to 1.5.3	2026-02-11 13:59:49 +01:00
Arthur Meyre	271644f482	chore: bump tfhe-cuda-backend to 0.13.2	2026-02-11 13:59:49 +01:00
Arthur Meyre	5a7650a2de	chore: revert backward compatibility change - this change should not have been needed and poses risks for backward compatibility - HL CUDA: use dedicated type for the ReRand	2026-02-11 13:59:49 +01:00
Guillermo Oyarzun	a3a5b6fe40	fix(gpu): avoid unspecified behavior during the reduction	2026-02-11 13:55:47 +01:00
Pedro Alves	2210d637c4	fix(gpu): remove unused number_bits_to_unpack initialization in host_extract	2026-02-09 17:03:09 +01:00
Pedro Alves	1e5a38acb0	fix(gpu): assert all items have same shape in from_vec_cuda_lwe_ciphertexts_list	2026-02-09 17:03:09 +01:00
Pedro Alves	f6b08c45e9	fix(gpu): assert exactly one element in CudaLweCiphertextList::into_lwe_ciphertext	2026-02-09 17:03:09 +01:00
Pedro Alves	6145f6b680	fix(gpu): use lwe_compact_ciphertext_list_size for validation in CudaLweCompactCiphertextList::from_d_vec	2026-02-09 17:03:09 +01:00
Agnes Leroy	0d6602df84	chore(gpu): improve checks in device code	2026-02-09 17:03:09 +01:00
Agnes Leroy	a006d5d71f	fix(gpu): fix logic in lwe ciphertext list	2026-02-09 17:03:09 +01:00
Pedro Alves	a7c5f2dbf8	chore(gpu): replaces (a + b - 1) / b patterns in the cuda backend by CEIL_DIV - also, this commit renames kernel_dimensions.cuh to helper.cuh and copies the content of the older helper.cuh into helper_debug.cuh	2026-02-09 17:03:09 +01:00
Agnes Leroy	0816c331c5	chore(gpu): add some missing checks in core crypto	2026-02-09 17:03:09 +01:00
Agnes Leroy	89df44917b	chore(gpu): cleanup device.cu binding, remove _async functions from core crypto	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	b218c98194	fix(gpu): clean unused variables in specialized classical pbs	2026-02-09 17:03:09 +01:00
Andrei Stoian	2355cf4d89	fix(gpu): valgrind error on leaks	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	3b793273cf	feat(gpu): create different threshold for multi-gpu pbs128	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	d3b52d92cb	fix(gpu): fix race condition in tbc implementations	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	af7d69e16c	feat(gpu): avoid register spilling memory in ff128	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	6bb211f2fd	feat(gpu): use 512 threads for pbs128 flavors	2026-02-09 17:03:09 +01:00
Agnes Leroy	0fcce501a2	fix(gpu): fix small cpu memory leak	2026-02-09 17:03:09 +01:00
Agnes Leroy	204555e11c	fix(gpu): fix noise level in match value	2026-02-09 17:03:09 +01:00
Arthur Meyre	b00e8bafe5	chore: bump TFHE-rs to version 1.5.2	2026-02-09 13:05:01 +01:00
Arthur Meyre	140a780bb6	chore: bump tfhe-cuda-backend to 0.13.1	2026-02-09 13:05:01 +01:00
Thomas Montaigu	e646130324	chore: move shortint expanded types into shortint mod The expanded types definitions were in the high level API as it was originally related to the XofKeySet feature. However, since it's now used even in non-xof setting we decided to move these types to shortint module where they conceptually belong	2026-02-09 09:14:25 +01:00
Thomas Montaigu	f79204e23e	feat(hlapi): add is_conformant for CompressedXofKeySet	2026-02-09 09:14:25 +01:00
Thomas Montaigu	b6b24f4fb9	fix(xofkeyset): generate multibit decompression key when params are multibit	2026-02-09 09:14:25 +01:00
Thomas Montaigu	41621dc4e3	fix(conformance): GGSW list had wrong group count	2026-02-09 09:14:25 +01:00
Thomas Montaigu	662038da19	chore(xof_key_set): make generate_with_pre_seeded_generator public MPC teams need to be able to generate a CompressedXofKeySet from an existing ClientKey	2026-02-09 09:14:25 +01:00
Thomas Montaigu	9d21aecae9	feat(hlapi): add decompress_to_gpu for CompressedXofKeySet Done using the newly added expand + convert pattern	2026-02-09 09:14:25 +01:00
Thomas Montaigu	58dbdf7dd4	refactor(hlapi): add IntegerExpandedServerKey::convert_to_gpu And use it to convert from CompressedServerKey to CudaServerKey.	2026-02-09 09:14:25 +01:00
Thomas Montaigu	1a7b7ace47	refactor(hlapi): split gpu key conversion in expand/convert converting from CompressedServerKey (Cpu) to CudaServerKey was done via decompress_from_cpu/decompress_to_cuda methods. We refactor to split these functions in 2: one that converts from cpu (input in std domain for bootstrap keys), the other that decompress/expand then calls convert	2026-02-09 09:14:25 +01:00
Thomas Montaigu	7797b60ef2	refactor(hlapi): add expand() method to CompressedServerKey Add an expand method to CompressedServerKey that returns an IntegerExpandedServerKey. Refactor decompress() to use the new expand() then convert_to_cpu(). This will allow later to refactor the conversion from CompressedServerKey to CudaServerKey to follow a similar pattern, meaning we will be able to share the code that converts keys from CPU to GPU between the normal server key and the xof server key	2026-02-09 09:14:25 +01:00
Thomas Montaigu	56c0a9fa5e	fix(hlapi-gpu): correct state of post decompression noise squashed ct The GPU part did not set the correct state. This problem is sort of rare to encounter as generally a CompressedSquashedNoiseCiphertextList is deserialized then used. When deserialized it's on the CPU, so calls to `get` use CPU code which correctly sets the state. This problem is thus visible when either: - safe_deserializing and manually moving the list to GPU - deserialize - directly expanding after the creation of the list	2026-02-09 09:14:25 +01:00
Thomas Montaigu	37bcb7763d	fix(hlapi): return error when trying to expand non packed list	2026-02-09 09:14:25 +01:00
Thomas Montaigu	ae978bedc6	refactor(xof_key_set): split decompression into expansion and conversion Introduce IntegerExpandedServerKey as an intermediate representation between compressed (seeded) keys and backend-specific formats. Decompression is now a two-step process: 1. Seed expansion: decompress seeded keys into standard domain representations (e.g., LweBootstrapKey instead of FourierLweBootstrapKey) 2. Backend conversion: convert to target backend format (CPU Fourier, GPU, etc.) This separation allows sharing the expansion step across backends while specializing only the final conversion, as for this XOF based expansion the order is important Changes: - Split xof_key_set.rs into module structure (mod.rs, internal.rs, test.rs) - Add intermediate types, that contains the expanded, but not converted data	2026-02-09 09:14:25 +01:00
David Testé	43ebb14b9b	doc: fix specs description of aws hpc7a.96xlarge instance These instances have two sockets, each equipped with a 96-core CPU.	2026-02-06 12:25:07 +01:00
David Testé	641b394423	doc: add svg tables to zero-knowledge benchmarks This removes the embedded GSheet. These SVGs display more operations and inputs that reflect real-use cases. Throughput is available only for server-side computation as it's meaningless to perform multiple proofs in parallel on the client-side.	2026-02-02 18:05:08 +01:00
Beka Barbakadze	72c18cc2b2	fix(gpu): add __syncthreads and threadIdx condition for sample_extract_body in all pbs versions	2026-01-30 20:21:15 +01:00
Pedro Alves	71e95c435d	chore(gpu): add test for single-item compact ciphertext list expand	2026-01-30 20:21:15 +01:00
Andrei Stoian	057217b7d2	fix(gpu): more crypto param checks in cuda backend	2026-01-30 20:21:15 +01:00
Beka Barbakadze	f006039916	refactor(gpu): refactor f128 and fft128 to prevent possible precision losses and improve performance	2026-01-30 20:21:15 +01:00
Pedro Alves	69df8c0310	fix(gpu): fix some inconsistencies in decompression that could enable access of not allocated memory - also adds a diagram explaining how compression / decompression work	2026-01-30 20:21:15 +01:00
Agnes Leroy	482e49eac5	fix(gpu): add check on nullptr for dest_indexes in many lut gather	2026-01-29 15:58:18 +01:00
Guillermo Oyarzun	0dcf2ace3b	fix(gpu): handling temporary events destruction	2026-01-29 15:58:18 +01:00
Pedro Alves	0483c95941	fix(gpu): fix an invalid access in expand when the number of LWEs is odd - also improves test_expander_length_matches_data_items - adds diagrams and explanations about GPU's expand	2026-01-29 15:58:18 +01:00
Guillermo Oyarzun	f9b292ca77	fix(gpu): add panic for 32-bit Torus calls	2026-01-29 15:58:18 +01:00
Agnes Leroy	8e194c4b65	chore(gpu): fix logic to check ptr validity in device.cu	2026-01-29 15:58:18 +01:00
Agnes Leroy	f02bba151c	chore(gpu): stop trying to enable NVlink since we don't use it	2026-01-29 15:58:18 +01:00
Agnes Leroy	a477548161	chore(gpu): add comments in device.cu	2026-01-29 15:58:18 +01:00
David Testé	45a7d5217c	chore(docs): update benchmark results for all backends	2026-01-26 17:08:40 +01:00
Andrei Stoian	865c0887ee	fix(gpu): mutex lock	2026-01-26 11:20:54 +01:00
Agnes Leroy	0b9df1c8cd	doc: add erc20 benchmark results for all backends	2026-01-26 11:20:54 +01:00
Pedro Alves	cf20e337ef	fix(gpu): fix an inconsistency between CudaCompactCiphertextListExpander::len() and the CPU equivalent	2026-01-23 15:37:18 +01:00
Agnes Leroy	d7aea61eba	fix(gpu): fix potential overflow in create_on_same_gpus	2026-01-23 15:37:18 +01:00
Arthur Meyre	9352ecce54	chore: bump version to 1.5.1	2026-01-20 14:17:55 +01:00
Arthur Meyre	179b52ab26	feat: add missing raw parts APIs for shortint (Compressed)DecompressionKey	2026-01-20 14:17:55 +01:00