chore(ci): update slab-github-runner action to v1.6.0

This action version now uses node24 as runner since node20 support is dropped on April 2026.
chore: add compressed xof key set backward to 1.5
2026-04-28 03:01:21 -04:00 · 2026-04-01 09:47:54 +02:00 · 2026-03-11 17:26:14 +01:00 · 2026-03-11 17:26:14 +01:00 · 2026-03-11 17:26:14 +01:00 · 2026-03-11 17:26:14 +01:00
231 changed files with 6619 additions and 3726 deletions
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -41,7 +41,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -141,7 +141,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -146,7 +146,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -299,7 +299,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -86,7 +86,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -100,7 +100,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -87,7 +87,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -172,7 +172,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -155,7 +155,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -279,7 +279,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -39,7 +39,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -147,7 +147,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_cpu_common.yml
+++ b/.github/workflows/benchmark_cpu_common.yml
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -261,7 +261,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_ct_key_sizes.yml
+++ b/.github/workflows/benchmark_ct_key_sizes.yml
@@ -33,7 +33,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -134,7 +134,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_coprocessor.yml
+++ b/.github/workflows/benchmark_gpu_coprocessor.yml
@@ -92,7 +92,7 @@ jobs:
    steps:
      - name: Start remote instance
        id: start-remote-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -324,7 +324,7 @@ jobs:
    steps:
      - name: Stop remote instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -143,7 +143,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -383,7 +383,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -71,7 +71,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -213,7 +213,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build_common.yml
+++ b/.github/workflows/cargo_build_common.yml
@@ -80,7 +80,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -242,7 +242,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -60,7 +60,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -31,7 +31,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -37,7 +37,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -43,7 +43,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -198,7 +198,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -79,7 +79,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -187,7 +187,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -38,7 +38,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer.yml
+++ b/.github/workflows/gpu_memory_sanitizer.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_memory_sanitizer_h100.yml
+++ b/.github/workflows/gpu_memory_sanitizer_h100.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -159,7 +159,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -81,7 +81,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -87,7 +87,7 @@ jobs:
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        continue-on-error: true
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -82,7 +82,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
      - name: Stop instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -62,7 +62,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -114,7 +114,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -34,7 +34,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -83,7 +83,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -37,7 +37,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -221,7 +221,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
+        uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.13.0"
+version = "0.13.2"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -10,11 +10,7 @@ extern std::mutex m;
 extern bool p2p_enabled;
 extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
 extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
-
-extern "C" {
-int32_t cuda_setup_multi_gpu(int device_0_id);
-}
-
+extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
 // Define a variant type that can be either a vector or a single pointer
 template <typename Torus>
 using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
@@ -42,6 +38,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,

 uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                              PBS_TYPE pbs_type);
+uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
+                                   PBS_TYPE pbs_type);

 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

@@ -70,7 +68,7 @@ public:
  // Construct an empty set. Invalid use of an empty set should raise an error
  // right away through asserts or because of a nullptr dereference
  CudaStreams()
-      : _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
+      : _streams(nullptr), _gpu_indexes(nullptr), _gpu_count(0),
        _owns_streams(false) {}

  // Returns a subset of this set as an active subset. An active subset is one
@@ -80,7 +78,15 @@ public:
        _streams, _gpu_indexes,
        get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
  }
-
+  // Returns a subset of this set as an active subset for pbs128. An active
+  // subset is one that is temporarily used to perform some computation. For
+  // pbs128, the threshold is different, because the original threshold was
+  // designed for 2_2 params.
+  CudaStreams active_gpu_subset_u128(int num_radix_blocks, PBS_TYPE pbs_type) {
+    return CudaStreams(
+        _streams, _gpu_indexes,
+        get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type));
+  }
  // Returns a CudaStreams struct containing only the ith stream
  CudaStreams get_ith(int i) const {
    return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
@@ -114,11 +120,13 @@ public:
  // streams on the same GPU
  void create_on_same_gpus(const CudaStreams &other) {
    PANIC_IF_FALSE(_streams == nullptr,
-                   "Assign clone to non-empty cudastreams");
+                   "Cuda error: Assign clone to non-empty CudaStreams");
+    PANIC_IF_FALSE(_gpu_count <= 8,
+                   "Cuda error: GPU count should be in the interval [0, 8]");

    cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];

-    uint32_t *gpu_indexes_clone = new uint32_t[_gpu_count];
+    uint32_t *gpu_indexes_clone = new uint32_t[other._gpu_count];
    for (uint32_t i = 0; i < other._gpu_count; ++i) {
      new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
      gpu_indexes_clone[i] = other._gpu_indexes[i];
@@ -170,6 +178,7 @@ public:
      _streams = nullptr;
      delete[] _gpu_indexes;
      _gpu_indexes = nullptr;
+      _gpu_count = 0;
    }
  }

@@ -483,4 +492,38 @@ public:
  }
 };

+// Event pool for managing temporary CUDA events in scatter/gather operations
+struct CudaEventPool {
+private:
+  std::vector<cudaEvent_t> _events;
+  std::vector<uint32_t> _gpu_indices;
+
+public:
+  CudaEventPool() {}
+
+  // Requests a new event from the pool (creates and stores it)
+  cudaEvent_t request_event(uint32_t gpu_index) {
+    cudaEvent_t event = cuda_create_event(gpu_index);
+    _events.push_back(event);
+    _gpu_indices.push_back(gpu_index);
+    return event;
+  }
+
+  // Releases all pooled events
+  // This should always be called in the release of the LUT, so streams
+  // are already synchronized
+  void release() {
+    for (size_t i = 0; i < _events.size(); i++) {
+      cuda_event_destroy(_events[i], _gpu_indices[i]);
+    }
+    _events.clear();
+    _gpu_indices.clear();
+  }
+
+  ~CudaEventPool() {
+    GPU_ASSERT(_events.empty(),
+               "CudaEventPool: must call release before destruction");
+  }
+};
+
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/comparison.h
@@ -28,7 +28,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
    Torus total_modulus = params.message_modulus * params.carry_modulus;
    uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);

-    int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
+    int max_chunks = CEIL_DIV(num_radix_blocks, max_value);
    tmp_out = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), tmp_out, num_radix_blocks,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -11,16 +11,26 @@ template <typename Torus> struct int_compression {
  Torus *tmp_glwe_array_out;
  bool gpu_memory_allocated;
  uint32_t lwe_per_glwe;
+  uint32_t max_num_glwes;

+  // num_radix_blocks: total number of LWE ciphertexts (radix blocks) to
+  // compress lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size),
+  // defined by the chosen parameter set
  int_compression(CudaStreams streams, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
+    this->lwe_per_glwe = lwe_per_glwe;

    uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                     compression_params.polynomial_size;

+    // Calculate the actual number of GLWEs needed based on total radix blocks.
+    // This ensures we allocate enough memory when num_radix_blocks >
+    // lwe_per_glwe.
+    max_num_glwes = CEIL_DIV(num_radix_blocks, lwe_per_glwe);
+
    tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
        num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
            sizeof(Torus),
@@ -28,7 +38,7 @@ template <typename Torus> struct int_compression {
        allocate_gpu_memory));
    tmp_glwe_array_out =
        static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
-            lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
+            max_num_glwes * glwe_accumulator_size * sizeof(Torus),
            streams.stream(0), streams.gpu_index(0), size_tracker,
            allocate_gpu_memory));

--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -34,8 +34,10 @@ public:
    } else if ((msg_mod) == 4 && (carry_mod) == 4) {                           \
      constexpr int max_noise_level = 5;                                       \
      if ((noise_level_expr) > max_noise_level)                                \
-        PANIC("Cuda error: noise exceeds maximum authorized value for 2_2 "    \
-              "parameters");                                                   \
+        PANIC(                                                                 \
+            "Cuda error: noise %d exceeds maximum authorized value 5 for 2_2"  \
+            " parameters",                                                     \
+            noise_level_expr);                                                 \
    } else if ((msg_mod) == 8 && (carry_mod) == 8) {                           \
      constexpr int max_noise_level = 9;                                       \
      if ((noise_level_expr) > max_noise_level)                                \
@@ -349,6 +351,7 @@ struct int_radix_lut_custom_input_output {

  CudaStreamsBarrier multi_gpu_scatter_barrier, multi_gpu_broadcast_barrier;
  CudaStreamsBarrier multi_gpu_gather_barrier;
+  CudaEventPool event_pool;

  // Setup the LUT configuration:
  // input_big_lwe_dimension: BIG LWE dimension of the KS output / PBS input
@@ -371,8 +374,13 @@ struct int_radix_lut_custom_input_output {
    this->num_input_blocks = num_input_blocks;
    this->gpu_memory_allocated = allocate_gpu_memory;

-    this->active_streams =
-        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    if (sizeof(OutputTorus) == 16) {
+      this->active_streams =
+          streams.active_gpu_subset_u128(num_radix_blocks, params.pbs_type);
+    } else {
+      this->active_streams =
+          streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
+    }
  }

  void setup_degrees() {
@@ -384,9 +392,13 @@ struct int_radix_lut_custom_input_output {
  void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {

+    int classical_threshold =
+        sizeof(OutputTorus) == 16
+            ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+            : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                        : classical_threshold;

    for (uint i = 0; i < active_streams.count(); i++) {
      cuda_set_device(active_streams.gpu_index(i));
@@ -458,11 +470,14 @@ struct int_radix_lut_custom_input_output {
                               lwe_trivial_indexes, num_radix_blocks,
                               allocate_gpu_memory);
  }
-
  void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
+    int classical_threshold =
+        sizeof(OutputTorus) == 16
+            ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+            : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                        : classical_threshold;

    auto inputs_on_gpu = std::min(
        (int)num_input_blocks,
@@ -809,10 +824,13 @@ struct int_radix_lut_custom_input_output {
  void allocate_lwe_vector_for_non_trivial_indexes(
      CudaStreams streams, uint64_t max_num_radix_blocks,
      uint64_t &size_tracker, bool allocate_gpu_memory) {
-
+    int classical_threshold =
+        sizeof(OutputTorus) == 16
+            ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+            : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                        : classical_threshold;

    // We need to create the auxiliary array only in GPU 0
    if (active_streams.count() > 1) {
@@ -865,6 +883,7 @@ struct int_radix_lut_custom_input_output {

    if (active_streams.count() > 1) {
      active_streams.synchronize();
+      event_pool.release();
      multi_gpu_gather_barrier.release();
      multi_gpu_broadcast_barrier.release();
      multi_gpu_scatter_barrier.release();
@@ -2055,7 +2074,7 @@ template <typename Torus> struct int_sc_prop_memory {
    uint32_t block_modulus = message_modulus * carry_modulus;
    uint32_t num_bits_in_block = std::log2(block_modulus);
    uint32_t grouping_size = num_bits_in_block;
-    num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
+    num_groups = CEIL_DIV(num_radix_blocks, grouping_size);

    num_many_lut = 2; // many luts apply 2 luts
    uint32_t box_size = polynomial_size / block_modulus;
@@ -2489,7 +2508,7 @@ template <typename Torus> struct int_borrow_prop_memory {
    uint32_t num_bits_in_block = std::log2(block_modulus);
    uint32_t grouping_size = num_bits_in_block;
    group_size = grouping_size;
-    num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
+    num_groups = CEIL_DIV(num_radix_blocks, grouping_size);

    num_many_lut = 2; // many luts apply 2 luts
    uint32_t box_size = polynomial_size / block_modulus;
@@ -2563,7 +2582,9 @@ template <typename Torus> struct int_borrow_prop_memory {
  void release(CudaStreams streams) {

    shifted_blocks_borrow_state_mem->release(streams);
+    delete shifted_blocks_borrow_state_mem;
    prop_simu_group_carries_mem->release(streams);
+    delete prop_simu_group_carries_mem;
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   overflow_block, gpu_memory_allocated);

--- a/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/oprf.h
@@ -22,8 +22,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
    uint32_t calculated_active_blocks =
        total_random_bits == 0
            ? 0
-            : (total_random_bits + message_bits_per_block - 1) /
-                  message_bits_per_block;
+            : CEIL_DIV(total_random_bits, message_bits_per_block);
    if (num_blocks_to_process != calculated_active_blocks) {
      PANIC(
          "num_blocks_to_process should be equal to calculated_active_blocks");
@@ -170,8 +169,7 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
    this->allocate_gpu_memory = allocate_gpu_memory;

    this->num_random_input_blocks =
-        (num_input_random_bits + message_bits_per_block - 1) /
-        message_bits_per_block;
+        CEIL_DIV(num_input_random_bits, message_bits_per_block);

    this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
        streams, params, this->num_random_input_blocks, message_bits_per_block,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/vector_find.h
@@ -7,7 +7,8 @@
 #include <functional>
 #include <vector>

-const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
+// If we use more than 5 streams the result is incorrect
+const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 5;

 template <typename Torus> struct int_equality_selectors_buffer {
  int_radix_params params;
@@ -175,8 +176,7 @@ template <typename Torus> struct int_possible_results_buffer {
    this->lut_stride =
        (ciphertext_modulus / this->max_luts_per_call) * box_size;

-    this->num_lut_accumulators =
-        (total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
+    this->num_lut_accumulators = CEIL_DIV(total_luts_needed, max_luts_per_call);

    stream_luts =
        new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
@@ -73,9 +73,10 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
                                                int8_t **fp_ks_buffer,
                                                bool gpu_memory_allocated);

-void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
-                                   void const *input, void *output,
-                                   uint32_t base_log, uint32_t level_count);
+void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
+                                         void const *input, void *output,
+                                         uint32_t base_log,
+                                         uint32_t level_count);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -14,10 +14,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
    uint32_t casting_output_dimension, uint32_t casting_ks_level,
    uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
-    const bool *is_boolean_array, uint32_t num_compact_lists,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    KS_TYPE casting_key_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
+    const bool *is_boolean_array, const uint32_t is_boolean_array_len,
+    uint32_t num_compact_lists, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_expand_without_verification_64(
    CudaStreamsFFI streams, void *lwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -118,7 +118,8 @@ template <typename Torus> struct zk_expand_mem {
  zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
-                const bool *is_boolean_array, uint32_t num_compact_lists,
+                const bool *is_boolean_array,
+                const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
                bool allocate_gpu_memory, uint64_t &size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
@@ -270,14 +271,36 @@ template <typename Torus> struct zk_expand_mem {
      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
        auto lwe_index = i + num_packed_msgs * offset;
        auto lwe_index_in_list = i % num_lwes_in_kth;
+        PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       lwe_index, num_packed_msgs * num_lwes);
        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
        h_indexes_out[lwe_index] =
            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
-        // If the input relates to a boolean, shift the LUT so the correct one
-        // with sanitization is used
+        PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       h_indexes_in[lwe_index], num_packed_msgs * num_lwes);
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
+                       "Cuda error: index %d is beyond the max value %d",
+                       h_indexes_out[lwe_index], num_packed_msgs * num_lwes);
+        // is_boolean_array tells us which input is a boolean and thus the
+        // related output needs boolean sanitization. It naturally has
+        // total_blocks entries, but h_indexes_out reaches
+        // message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
+        // the ceiling causes out-of-bounds access. Reading garbage "true" would
+        // set h_lut_indexes to an invalid index pointing to uninitialized
+        // memory instead of a real LUT. Rust pads is_boolean_array with FALSE
+        // to match.
+        PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
+                       "Cuda error: index %d for is_boolean_array is out of "
+                       "bounds (len is %d)",
+                       h_indexes_out[lwe_index], is_boolean_array_len);
        auto boolean_offset =
            is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
        h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
+        PANIC_IF_FALSE(
+            h_lut_indexes[lwe_index] < 4,
+            "Cuda error: lut index is greater than the max possible value (3)");
      }
      offset += num_lwes_in_kth;
    }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -183,9 +183,10 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_128(
      base_log, level_count, num_lwes);
 }

-void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
-                                   void const *input, void *output,
-                                   uint32_t base_log, uint32_t level_count) {
+void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
+                                         void const *input, void *output,
+                                         uint32_t base_log,
+                                         uint32_t level_count) {
  host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
                                  static_cast<const uint64_t *>(input),
                                  static_cast<uint64_t *>(output), base_log,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -10,7 +10,6 @@
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
 #include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

@@ -351,6 +350,7 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
      Torus state =
          init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
      uint32_t offset = i * level_count * (lwe_dimension_out + 1);
+#pragma unroll 1
      for (int j = 0; j < level_count; j++) {

        KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
@@ -363,16 +363,15 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
    lwe_acc_out[shmem_index] = local_lwe_out;
  }

-  if (tid <= lwe_dimension_out) {
-    for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
-         offset /= 2) {
-      __syncthreads();
+  for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
+    __syncthreads();
+    if (tid <= lwe_dimension_out && threadIdx.y < offset) {
      lwe_acc_out[shmem_index] +=
          lwe_acc_out[shmem_index + offset * blockDim.x];
    }
-    if (threadIdx.y == 0)
-      block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
  }
+  if (tid <= lwe_dimension_out && threadIdx.y == 0)
+    block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
 }

 template <typename Torus, typename KSTorus>
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/packing_keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/packing_keyswitch.cuh
@@ -12,12 +12,9 @@
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
 #include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

-#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
-
 // Finish the keyswitching operation and prepare GLWEs for accumulation.
 // 1. Finish the keyswitching computation partially performed with a GEMM:
 //  - negate the dot product between the GLWE and KSK polynomial
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -6,7 +6,7 @@
 #include "helper_multi_gpu.h"
 #include "polynomial/parameters.cuh"
 #include "types/int128.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <limits>

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -1,4 +1,5 @@
 #include "device.h"
+#include "utils/helper.cuh"
 #include <cstdint>
 #include <cuda_runtime.h>
 #include <mutex>
@@ -6,6 +7,27 @@
 #include <cuda_profiler_api.h>
 #endif

+void validate_device_ptr_and_gpu_index(const void *ptr, uint32_t gpu_index) {
+  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
+
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
+  if (attr.device != gpu_index || attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer.")
+  }
+}
+
+int validate_device_ptr(const void *ptr) {
+  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
+
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
+  if (attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer.")
+  }
+  return attr.device;
+}
+
 uint32_t cuda_get_device() {
  int device;
  check_cuda_error(cudaGetDevice(&device));
@@ -30,8 +52,9 @@ bool mem_pools_enabled = false;
 // better results.
 void cuda_setup_mempool(uint32_t caller_gpu_index) {
  if (!mem_pools_enabled) {
-    pool_mutex.lock();
-    if (mem_pools_enabled)
+    std::lock_guard lock(pool_mutex);
+    if (mem_pools_enabled) // double-check - mem_pools_enabled might have been
+                           // changed in a different thread
      return; // If mem pools are already enabled, we don't need to do anything

    // We do it only once for all GPUs
@@ -78,7 +101,6 @@ void cuda_setup_mempool(uint32_t caller_gpu_index) {
    }
    // We return to the original gpu_index
    cuda_set_device(caller_gpu_index);
-    pool_mutex.unlock();
  }
 }

@@ -234,60 +256,61 @@ bool cuda_check_support_thread_block_clusters() {
 #endif
 }

-/// Copy memory to the GPU asynchronously
+/// Copy memory from the CPU to a GPU with size tracking.
+/// This copy is asynchronous only if the CPU memory was pinned, i.e.
+/// allocated using cudaMallocHost. This was shown to come with a performance
+/// penalty if we allocate all CPU data in this way in the backend, so
+/// cudaMallocHost is only used in specific places where we need an
+/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
+/// example to copy the bootstrapping key).
+/// The copy only happens if gpu_memory_allocated is true.
 void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
                                                 uint64_t size,
                                                 cudaStream_t stream,
                                                 uint32_t gpu_index,
                                                 bool gpu_memory_allocated) {
+
+  GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
+
  if (size == 0 || !gpu_memory_allocated)
    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
-  }
+  validate_device_ptr_and_gpu_index(dest, gpu_index);

  cuda_set_device(gpu_index);
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }

-/// Copy memory to the GPU asynchronously
+/// Copy memory from the CPU to a GPU.
+/// This copy is asynchronous only if the CPU memory was pinned, i.e.
+/// allocated using cudaMallocHost. This was shown to come with a performance
+/// penalty if we allocate all CPU data in this way in the backend, so
+/// cudaMallocHost is only used in specific places where we need an
+/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
+/// example to copy the bootstrapping key).
 void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
  cuda_memcpy_with_size_tracking_async_to_gpu(dest, src, size, stream,
                                              gpu_index, true);
 }

-/// Copy memory within a GPU asynchronously
+/// Copy memory within a GPU asynchronously.
+/// The copy only happens if gpu_memory_allocated is true
 void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    void *dest, void const *src, uint64_t size, cudaStream_t stream,
    uint32_t gpu_index, bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;
-  GPU_ASSERT(dest != nullptr,
-             "Cuda error: trying to copy gpu->gpu to null ptr");
-  GPU_ASSERT(src != nullptr,
-             "Cuda error: trying to copy gpu->gpu from null ptr");

-  cudaPointerAttributes attr_dest;
-  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  PANIC_IF_FALSE(
-      attr_dest.type == cudaMemoryTypeDevice,
-      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
-  cudaPointerAttributes attr_src;
-  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  PANIC_IF_FALSE(
-      attr_src.type == cudaMemoryTypeDevice,
-      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
+  int src_gpu_index = validate_device_ptr(src);
+  int dest_gpu_index = validate_device_ptr(dest);
  cuda_set_device(gpu_index);
-  if (attr_src.device == attr_dest.device) {
+  if (src_gpu_index == dest_gpu_index) {
    check_cuda_error(
        cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
  } else {
-    check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
-                                         attr_src.device, size, stream));
+    check_cuda_error(cudaMemcpyPeerAsync(dest, dest_gpu_index, src,
+                                         src_gpu_index, size, stream));
  }
 }
 void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
@@ -327,21 +350,20 @@ void cuda_synchronize_device(uint32_t gpu_index) {
  check_cuda_error(cudaDeviceSynchronize());
 }

+/// cuda_memset sets bytes, we basically only use it to initialize data to 0
+/// The memset only happens if gpu_memory_allocated is true
 void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
                                          uint64_t size, cudaStream_t stream,
                                          uint32_t gpu_index,
                                          bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
-  }
+  validate_device_ptr_and_gpu_index(dest, gpu_index);
  cuda_set_device(gpu_index);
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }

+/// cuda_memset sets bytes, we basically only use it to initialize data to 0
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                       cudaStream_t stream, uint32_t gpu_index) {
  cuda_memset_with_size_tracking_async(dest, val, size, stream, gpu_index,
@@ -366,7 +388,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
    }
    cuda_set_device(gpu_index);
    int block_size = 256;
-    int num_blocks = (n + block_size - 1) / block_size;
+    int num_blocks = CEIL_DIV(n, block_size);

    // Launch the kernel
    cuda_set_value_kernel<Torus>
@@ -384,15 +406,15 @@ template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                                   uint32_t n);

 /// Copy memory to the CPU asynchronously
+/// This comes with a big penalty on performance even if the CPU
+/// memory is pinned (using cudaMallocHost for the CPU allocation),
+/// so it should be avoided at all costs
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index) {
+  GPU_ASSERT(dest != nullptr, "Cuda error: null host ptr");
  if (size == 0)
    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, src));
-  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
-  }
+  validate_device_ptr_and_gpu_index(src, gpu_index);

  cuda_set_device(gpu_index);
  check_cuda_error(
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
@@ -68,9 +68,15 @@ struct alignas(16) f128 {
    auto t = two_sum(a.lo, b.lo);

    double hi = s.hi;
+#ifdef __CUDA_ARCH__
+    double lo = __dadd_rn(s.lo, t.hi);
+    hi = __dadd_rn(hi, lo);
+    lo = __dsub_rn(lo, __dsub_rn(hi, s.hi));
+#else
    double lo = s.lo + t.hi;
    hi = hi + lo;
    lo = lo - (hi - s.hi);
+#endif

    return f128(hi, lo + t.lo);
  }
@@ -104,8 +110,13 @@ struct alignas(16) f128 {
  __host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
    auto s = two_diff(a.hi, b.hi);
    auto t = two_diff(a.lo, b.lo);
+#ifdef __CUDA_ARCH__
+    s = quick_two_sum(s.hi, __dadd_rn(s.lo, t.hi));
+    return quick_two_sum(s.hi, __dadd_rn(s.lo, t.lo));
+#else
    s = quick_two_sum(s.hi, s.lo + t.hi);
    return quick_two_sum(s.hi, s.lo + t.lo);
+#endif
  }

  // Multiplication
@@ -220,16 +231,16 @@ struct f128x2 {
  // Subtraction
  __host__ __device__ friend f128x2 operator-(const f128x2 &a,
                                              const f128x2 &b) {
-    return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
-                  f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
+    return f128x2(f128::sub_estimate(a.re, b.re),
+                  f128::sub_estimate(a.im, b.im));
  }

  // Multiplication (complex multiplication)
  __host__ __device__ friend f128x2 operator*(const f128x2 &a,
                                              const f128x2 &b) {
+    const f128 a_im_b_im = f128::mul(a.im, b.im);
    f128 real_part =
-        f128::add(f128::mul(a.re, b.re),
-                  f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
+        f128::add(f128::mul(a.re, b.re), f128(-a_im_b_im.hi, -a_im_b_im.lo));
    f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
    return f128x2(real_part, imag_part);
  }
@@ -243,8 +254,8 @@ struct f128x2 {

  // Subtraction-assignment operator
  __host__ __device__ f128x2 &operator-=(const f128x2 &other) {
-    re = f128::add(re, f128(-other.re.hi, -other.re.lo));
-    im = f128::add(im, f128(-other.im.hi, -other.im.lo));
+    re = f128::sub_estimate(re, other.re);
+    im = f128::sub_estimate(im, other.im);
    return *this;
  }

@@ -261,12 +272,20 @@ struct f128x2 {
 };

 __host__ __device__ inline uint64_t double_to_bits(double d) {
+#ifdef __CUDA_ARCH__
+  uint64_t bits = __double_as_longlong(d);
+#else
  uint64_t bits = *reinterpret_cast<uint64_t *>(&d);
+#endif
  return bits;
 }

 __host__ __device__ inline double bits_to_double(uint64_t bits) {
+#ifdef __CUDA_ARCH__
+  double d = __longlong_as_double(bits);
+#else
  double d = *reinterpret_cast<double *>(&bits);
+#endif
  return d;
 }

@@ -275,6 +294,8 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
  const double A = ONE << 52;
  const double B = ONE << 104;
  const double C = ONE << 76;
+  // NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
+  // does not work here
  const double D = 340282366920938500000000000000000000000.;

  const __uint128_t threshold = (ONE << 104);
@@ -288,15 +309,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = A_bits | lower64;
    double l_temp = bits_to_double(bits_l);
-    double l = l_temp - A;

    uint64_t B_bits = double_to_bits(B);
    uint64_t top64 = static_cast<uint64_t>(x >> 52);
    uint64_t bits_h = B_bits | top64;
    double h_temp = bits_to_double(bits_h);
+
+#ifdef __CUDA_ARCH__
+    return __dadd_rn(__dsub_rn(l_temp, A), __dsub_rn(h_temp, B));
+#else
+    double l = l_temp - A;
    double h = h_temp - B;

    return (l + h);
+#endif

  } else {
    uint64_t C_bits = double_to_bits(C);
@@ -310,15 +336,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {

    uint64_t bits_l = C_bits | lower64 | mask_part;
    double l_temp = bits_to_double(bits_l);
-    double l = l_temp - C;

    uint64_t D_bits = double_to_bits(D);
    uint64_t top64 = static_cast<uint64_t>(x >> 76);
    uint64_t bits_h = D_bits | top64;
    double h_temp = bits_to_double(bits_h);
+
+#ifdef __CUDA_ARCH__
+    return __dadd_rn(__dsub_rn(l_temp, C), __dsub_rn(h_temp, D));
+#else
+    double l = l_temp - C;
    double h = h_temp - D;

    return (l + h);
+#endif
  }
 }

@@ -389,6 +420,8 @@ __host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {

 __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  auto x = f128::sub_estimate(a, f128::f128_floor(a));
+  // NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
+  // does not work here
  const double normalization = 340282366920938500000000000000000000000.;
 #ifdef __CUDA_ARCH__
  x.hi = __dmul_rn(x.hi, normalization);
@@ -398,7 +431,7 @@ __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
  x.lo *= normalization;
 #endif

-  // TODO has to be round
+  x = f128::add_estimate(x, f128(0.5, 0.0));
  x = f128::f128_floor(x);

  __uint128_t x0 = f64_to_u128(x.hi);
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -12,8 +12,9 @@
 using Index = unsigned;

 #define NEG_TWID(i)                                                            \
-  f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]),               \
-         f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
+  f128x2(                                                                      \
+      f128(__ldg(&neg_twiddles_re_hi[(i)]), __ldg(&neg_twiddles_re_lo[(i)])),  \
+      f128(__ldg(&neg_twiddles_im_hi[(i)]), __ldg(&neg_twiddles_im_lo[(i)])))

 #define F64x4_TO_F128x2(f128x2_reg, ind)                                       \
  f128x2_reg.re.hi = dt_re_hi[ind];                                            \
@@ -75,7 +76,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
-      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+      if (u_stays_in_register) {
+        F128x2_TO_F64x4(v[i], tid);
+      } else {
+        F128x2_TO_F64x4(u[i], tid);
+      }
      tid = tid + STRIDE;
    }
    __syncthreads();
@@ -86,8 +91,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);
-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
+      if (u_stays_in_register) {
+        v[i] = w;
+      } else {
+        u[i] = w;
+      }
      w = NEG_TWID(tid / lane_mask + twiddle_shift);
      f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
      f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
@@ -151,7 +159,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      // keep one of the register for next iteration and store another one in sm
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
-      F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
+      if (u_stays_in_register) {
+        F128x2_TO_F64x4(v[i], tid);
+      } else {
+        F128x2_TO_F64x4(u[i], tid);
+      }

      tid = tid + STRIDE;
    }
@@ -165,8 +177,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
      bool u_stays_in_register = rank < lane_mask;
      F64x4_TO_F128x2(w, tid ^ lane_mask);

-      u[i] = (u_stays_in_register) ? u[i] : w;
-      v[i] = (u_stays_in_register) ? w : v[i];
+      if (u_stays_in_register) {
+        v[i] = w;
+      } else {
+        u[i] = w;
+      }

      tid = tid + STRIDE;
    }
@@ -218,7 +233,7 @@ __device__ void convert_u128_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re, const __uint128_t *in_im) {

-  const double normalization = pow(2., -128.);
+  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
@@ -241,7 +256,7 @@ __device__ void convert_u128_on_regs_to_f128_as_torus(
    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {

-  const double normalization = pow(2., -128.);
+  const double normalization = __longlong_as_double(0x37f0000000000000ULL);
  Index tid = threadIdx.x;
  // #pragma unroll
  for (Index i = 0; i < params::opt / 2; i++) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -12,7 +12,7 @@
 #include "integer/subtraction.cuh"
 #include "pbs/programmable_bootstrap_classic.cuh"
 #include "pbs/programmable_bootstrap_multibit.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"

 // lwe_dimension + 1 threads
 // todo: This kernel MUST be refactored to a binary reduction
@@ -98,7 +98,7 @@ __host__ void are_all_comparisons_block_true(

  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    int num_chunks = CEIL_DIV(remaining_blocks, max_value);

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
@@ -222,7 +222,7 @@ __host__ void is_at_least_one_comparisons_block_true(
  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    int num_chunks = CEIL_DIV(remaining_blocks, max_value);

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -10,7 +10,122 @@
 #include "integer/integer.cuh"
 #include "linearalgebra/multiplication.cuh"
 #include "polynomial/functions.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
+
+/*
+ * =============================================================================
+ * GPU Compression/Decompression Algorithm: Overview
+ * =============================================================================
+ *
+ * The compression algorithm transforms standard LWE ciphertexts into a compact
+ * packed format. Decompression reverses this process.
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION INPUT (lwe_array_in)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |                    lwe_array_in (GPU memory)                            |
+ *  +-------------------------------------------------------------------------+
+ *  +---------------------------+---------------------------+-----------------+
+ *  |          LWE 0            |          LWE 1            |      ...        |
+ *  |      [mask, body]         |      [mask, body]         |                 |
+ *  +---------------------------+---------------------------+-----------------+
+ *  |<-- lwe_dimension + 1 -->|
+ *
+ *  Total LWEs: total_lwe_bodies_count (num_radix_blocks)
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION PROCESS
+ * -----------------------------------------------------------------------------
+ *
+ * 1. Message Shift (64-bit only):
+ *    Each LWE is multiplied by message_modulus to shift the message to MSB
+ *
+ * 2. Packing Keyswitch (LWE -> GLWE):
+ *    Groups of up to lwe_per_glwe LWEs are packed into a single GLWE:
+ *
+ *    +--------------------------------------------------------------+
+ *    |   lwe_per_glwe LWEs (input batch)                            |
+ *    |   LWE[0], LWE[1], ..., LWE[lwe_per_glwe-1]                   |
+ *    +--------------------------------------------------------------+
+ *                              |
+ *                    Packing Keyswitch
+ *                              v
+ *    +--------------------------------------------------------------+
+ *    |            Single GLWE Ciphertext                            |
+ *    |   [A_0, A_1, ..., A_{k-1}, B]                                |
+ *    |   |<-- k * polynomial_size -->| |<-- polynomial_size -->|   |
+ *    +--------------------------------------------------------------+
+ *
+ *    Number of output GLWEs: num_glwes = ceil(total_lwe_bodies_count /
+ *                                             lwe_per_glwe)
+ *
+ * 3. Modulus Switch:
+ *    Reduce precision from 64-bit torus to storage_log_modulus bits
+ *
+ * 4. Bit Packing:
+ *    Pack multiple reduced-precision elements into dense bit representation
+ *
+ * -----------------------------------------------------------------------------
+ * COMPRESSION MEMORY LAYOUT (tmp_glwe_array_out)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |                 tmp_glwe_array_out (intermediate buffer)                |
+ *  +-------------------------------------------------------------------------+
+ *  +----------------------------+----------------------------+---------------+
+ *  |         GLWE 0             |         GLWE 1             |    ...        |
+ *  |  [A_0..A_{k-1}, B_0..B_N]  |  [A_0..A_{k-1}, B_0..B_N]  |               |
+ *  +----------------------------+----------------------------+---------------+
+ *       |<-- glwe_accumulator_size = (k+1)*N -->|
+ *
+ *  Total size needed: num_glwes * glwe_accumulator_size elements
+ *  Where: num_glwes = ceil(total_lwe_bodies_count / lwe_per_glwe)
+ *
+ * -----------------------------------------------------------------------------
+ * PACKED OUTPUT (glwe_array_out)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |              Packed GLWE Ciphertext List (bit-packed)                   |
+ *  +-------------------------------------------------------------------------+
+ *  +-------------------------------------------------------------------------+
+ *  |  Elements packed with storage_log_modulus bits per original element    |
+ *  |  Total packed size: ceil(in_len * storage_log_modulus / 64) elements   |
+ *  +-------------------------------------------------------------------------+
+ *
+ * =============================================================================
+ * DECOMPRESSION (Extract) Algorithm
+ * =============================================================================
+ *
+ * Decompression receives an array of LWE indexes. For each index, it identifies
+ * the corresponding GLWE, extracts that GLWE from the packed representation,
+ * and then sample-extracts the requested LWE from the GLWE.
+ *
+ * -----------------------------------------------------------------------------
+ * EXTRACT OUTPUT LAYOUT (glwe_array_out in host_extract)
+ * -----------------------------------------------------------------------------
+ *
+ *  +-------------------------------------------------------------------------+
+ *  |               Extracted GLWE Ciphertext                                 |
+ *  +-------------------------------------------------------------------------+
+ *  +---------------------------------------+-----------------+---------------+
+ *  |    Mask (A polynomials)               |   Body (B)      |    Tail       |
+ *  |    [A_0, ..., A_{k-1}]                |   (body_count)  |   (zeroed)    |
+ *  |    k * polynomial_size elements       |   elements      |   elements    |
+ *  +---------------------------------------+-----------------+---------------+
+ *  |<------------------- initial_out_len ------------------->|               |
+ *  |<------------------------ glwe_ciphertext_size ------------------------->|
+ *
+ *  For the last GLWE, body_count may be less than polynomial_size (partial).
+ *  The tail region must be zeroed to ensure defined behavior.
+ *
+ *  tail_size = glwe_ciphertext_size - initial_out_len
+ *  tail_offset = initial_out_len  (NOT 0!)
+ *
+ * =============================================================================
+ */

 template <typename Torus>
 __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
@@ -66,7 +181,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,

  // number_bits_to_pack.div_ceil(Scalar::BITS)
  auto nbits = sizeof(Torus) * 8;
-  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
+  auto out_len = CEIL_DIV(number_bits_to_pack, nbits);

  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(out_len, 1024, num_blocks, num_threads);
@@ -108,6 +223,8 @@ host_integer_compress(CudaStreams streams,
  uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
                        glwe_array_out->lwe_per_glwe - 1) /
                       glwe_array_out->lwe_per_glwe;
+  PANIC_IF_FALSE(num_glwes <= mem_ptr->max_num_glwes,
+                 "Invalid number of GLWEs");

  // Keyswitch LWEs to GLWE
  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -200,8 +317,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,

  auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;

-  uint32_t num_glwes =
-      (total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
+  uint32_t num_glwes = CEIL_DIV(total_lwe_bodies_count, polynomial_size);

  // Compressed length of the compressed GLWE we want to extract
  uint32_t body_count = 0;
@@ -218,19 +334,21 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,

  uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;

-  // Calculates how many bits this particular GLWE shall use
-  auto number_bits_to_unpack = initial_out_len * log_modulus;
  auto nbits = sizeof(Torus) * 8;

-  // Calculates how many bits a full-packed GLWE shall use
-  number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
-  auto len = (number_bits_to_unpack + nbits - 1) / nbits;
+  // Calculate how many bits a full-packed GLWE uses, to determine
+  // the stride between consecutive packed GLWEs in the input buffer
+  auto number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
+  auto len = CEIL_DIV(number_bits_to_unpack, nbits);
  // Uses that length to set the input pointer
  auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;

  // Ensure the tail of the GLWE is zeroed
+  // The extract kernel writes initial_out_len elements starting at offset 0.
+  // We must zero the tail region (from initial_out_len to
+  // glwe_ciphertext_size)
  if (initial_out_len < glwe_ciphertext_size) {
-    cuda_memset_async(glwe_array_out, 0,
+    cuda_memset_async(glwe_array_out + initial_out_len, 0,
                      (glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
                      stream, gpu_index);
  }
@@ -368,7 +486,7 @@ host_integer_decompress(CudaStreams streams,
      /// gather data to GPU 0 we can copy back to the original indexing
      multi_gpu_scatter_lwe_async<Torus>(
          active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in,
-          lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
+          lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
          lut->active_streams.count(), num_blocks_to_decompress,
          compression_params.small_lwe_dimension + 1);

@@ -388,7 +506,7 @@ host_integer_decompress(CudaStreams streams,
      multi_gpu_gather_lwe_async<Torus>(
          active_streams, (Torus *)d_lwe_array_out->ptr, lwe_after_pbs_vec,
          lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-          lut->lwe_aligned_vec, num_blocks_to_decompress,
+          lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_decompress,
          encryption_params.big_lwe_dimension + 1);

      /// Synchronize all GPUs
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -14,7 +14,6 @@
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
 #include "utils/helper_profile.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include <algorithm>
 #include <functional>

@@ -273,8 +272,7 @@ __global__ void device_radix_split_simulators_and_grouping_pgns(
      }
    }

-    if ((blockIdx.x / group_size + 1) <
-        (blocks_count + group_size - 1) / group_size) {
+    if ((blockIdx.x / group_size + 1) < CEIL_DIV(blocks_count, group_size)) {
      size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size;
      size_t pgns_offset = (blockIdx.x / group_size) * lwe_size;
      for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
@@ -363,7 +361,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
      num_radix_blocks > src1->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks should have more "
          "blocks than the number used in sum in groups")
-  auto num_groups = (num_radix_blocks + group_size - 1) / group_size;
+  auto num_groups = CEIL_DIV(num_radix_blocks, group_size);
  if (src2->num_radix_blocks < num_groups)
    PANIC("Cuda error: second input in sum in groups should have at least "
          "num_groups blocks")
@@ -570,8 +568,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
-        big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
+        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(
@@ -594,7 +592,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)lwe_array_out->ptr, lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
+        big_lwe_dimension + 1);
    POP_RANGE()
    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
        active_streams);
@@ -674,8 +673,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
-        big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
+        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(
@@ -791,8 +790,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_pbs_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
-        big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
+        num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
    /// Apply KS to go from a big LWE dimension to a small LWE dimension
    execute_keyswitch_async<Torus>(
@@ -815,7 +814,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)(lwe_array_out->ptr), lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
+        lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
+        big_lwe_dimension + 1);
    POP_RANGE()
    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
        active_streams);
@@ -2342,7 +2342,7 @@ integer_radix_apply_noise_squashing(CudaStreams streams,

  // Since the radix ciphertexts are packed, we have to use the num_radix_blocks
  // from the output ct
-  auto active_streams = streams.active_gpu_subset(
+  auto active_streams = streams.active_gpu_subset_u128(
      lwe_array_out->num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<InputTorus>(
@@ -2373,8 +2373,9 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
    multi_gpu_scatter_lwe_async<InputTorus>(
        active_streams, lwe_array_in_vec, (InputTorus *)lwe_array_pbs_in->ptr,
        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_scatter_vec, lut->active_streams.count(),
-        lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
+        lut->lwe_aligned_scatter_vec, lut->event_pool,
+        lut->active_streams.count(), lwe_array_out->num_radix_blocks,
+        lut->input_big_lwe_dimension + 1);

    execute_keyswitch_async<InputTorus>(
        active_streams, lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -2397,7 +2398,8 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
    multi_gpu_gather_lwe_async<__uint128_t>(
        active_streams, (__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
        nullptr, lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
-        lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
+        lut->event_pool, lwe_array_out->num_radix_blocks,
+        big_lwe_dimension + 1);

    /// Synchronize all GPUs
    streams.synchronize();
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -362,7 +362,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
  radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
                                num_radix_in_vec, chunk_size, needs_processing);
  int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
-  int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
+  int part_count = CEIL_DIV(big_lwe_size, number_of_threads);
  const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);

  mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -10,7 +10,7 @@
 #include "device.h"
 #include "integer/integer.cuh"
 #include "integer/integer_utilities.h"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <iostream>
 #include <sstream>
 #include <string>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
@@ -54,7 +54,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in,
-        lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
+        lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
        active_streams.count(), num_blocks_to_process,
        mem_ptr->params.small_lwe_dimension + 1);
    POP_RANGE()
@@ -72,7 +72,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)radix_lwe_out->ptr, lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
-        lut->lwe_aligned_vec, num_blocks_to_process,
+        lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_process,
        mem_ptr->params.big_lwe_dimension + 1);
    POP_RANGE()
    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -4,8 +4,8 @@
 #include "device.h"
 #include "integer/integer.h"
 #include "integer/radix_ciphertext.h"
+#include "utils/helper.cuh"
 #include "utils/helper_profile.cuh"
-#include "utils/kernel_dimensions.cuh"

 inline CudaLweCiphertextListFFI
 to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/rerand.cuh
@@ -5,8 +5,8 @@
 #include "integer/radix_ciphertext.h"
 #include "integer/rerand.h"
 #include "integer/rerand_utilities.h"
+#include "utils/helper.cuh"
 #include "utils/helper_profile.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include "zk/zk_utilities.h"

 template <typename Torus, class params>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -8,7 +8,7 @@

 #include "device.h"
 #include "helper_multi_gpu.h"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <stdio.h>

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -12,7 +12,7 @@
 #include "integer/scalar_mul.h"
 #include "multiplication.cuh"
 #include "scalar_shifts.cuh"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <stdio.h>

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
@@ -116,7 +116,7 @@ __host__ void host_integer_overflowing_sub(
      radix_params.message_modulus * radix_params.carry_modulus;
  uint32_t num_bits_in_block = log2_int(block_modulus);
  uint32_t grouping_size = num_bits_in_block;
-  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
+  uint32_t num_groups = CEIL_DIV(num_blocks, grouping_size);

  host_unchecked_sub_with_correcting_term<Torus>(
      streams.stream(0), streams.gpu_index(0), output, input_left, input_right,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/vector_find.cuh
@@ -175,14 +175,17 @@ __host__ void host_aggregate_one_hot_vector(
    Torus *const *ksks) {

  int_radix_params params = mem_ptr->params;
+  if (params.message_modulus > 4 && params.carry_modulus > 4) {
+    PANIC("Cuda error: aggregate one hot vector is only implemented for 1_1 "
+          "and 2_2 params");
+  }
  uint32_t chunk_size = mem_ptr->chunk_size;
  uint32_t num_streams = mem_ptr->num_streams;

  mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
      streams);

-  uint32_t inputs_per_stream =
-      (num_input_ciphertexts + num_streams - 1) / num_streams;
+  uint32_t inputs_per_stream = CEIL_DIV(num_input_ciphertexts, num_streams);

  for (uint32_t s = 0; s < num_streams; s++) {

@@ -210,7 +213,7 @@ __host__ void host_aggregate_one_hot_vector(
    if (count_in_stream == 0)
      continue;

-    uint32_t num_chunks = (count_in_stream + chunk_size - 1) / chunk_size;
+    uint32_t num_chunks = CEIL_DIV(count_in_stream, chunk_size);

    //
    // Process chunks of input ciphertexts for the current stream
@@ -255,7 +258,10 @@ __host__ void host_aggregate_one_hot_vector(

  //
  // Aggregate partial results from all streams into the final aggregated vector
+  // num_streams has to be less than the max noise level otherwise we accumulate
+  // too much and the noise limit is exceeded
  //
+  CHECK_NOISE_LEVEL(num_streams, params.message_modulus, params.carry_modulus);
  for (uint32_t s = 1; s < num_streams; s++) {
    uint32_t start_idx = s * inputs_per_stream;
    if (start_idx >= num_input_ciphertexts)
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -8,7 +8,7 @@
 #include "helper_multi_gpu.h"
 #include "integer/integer.h"
 #include "integer/integer_utilities.h"
-#include "utils/kernel_dimensions.cuh"
+#include "utils/helper.cuh"
 #include <stdio.h>

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -6,7 +6,7 @@
 #include <cuda_runtime.h>
 #endif

-#include "../utils/kernel_dimensions.cuh"
+#include "../utils/helper.cuh"
 #include "device.h"
 #include "linear_algebra.h"
 #include <fstream>
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
@@ -6,7 +6,7 @@
 #include <cuda_runtime.h>
 #endif

-#include "../utils/kernel_dimensions.cuh"
+#include "../utils/helper.cuh"
 #include "device.h"
 #include "linear_algebra.h"

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -211,6 +211,8 @@ __global__ void device_programmable_bootstrap_amortized(
  // For the mask it's more complicated
  sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
+
+  // No need to sync here, it is already synchronized after add_to_torus
  sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
 }
@@ -303,7 +305,9 @@ __host__ void host_programmable_bootstrap_amortized(
    int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count) {
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap amortized only supports 64-bit "
+                 "Torus type.");
  uint64_t SM_FULL =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -173,6 +173,7 @@ __global__ void device_programmable_bootstrap_cg(
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -184,7 +185,8 @@ __global__ void device_programmable_bootstrap_cg(
              &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];
-
+          // No need to sync, it is already synchronized before the first
+          // sample_extract_body call
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -247,7 +249,9 @@ __host__ void host_programmable_bootstrap_cg(
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    uint32_t num_many_lut, uint32_t lut_stride) {
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap cg only supports 64-bit "
+                 "Torus type.");
  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
  uint64_t full_sm =
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -151,9 +151,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }

      } else if (blockIdx.y == glwe_dimension) {
-
+        __syncthreads();
        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
-
        if (num_many_lut > 1) {
          for (int i = 1; i < num_many_lut; i++) {

@@ -165,6 +164,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];

+            // No need to sync, it is already synchronized before the first
+            // sample_extract_body call
            sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                               accumulator, 0, i * lut_stride);
          }
@@ -302,7 +303,10 @@ __host__ void execute_cg_external_product_loop(
    uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
    uint32_t lut_stride) {
  cuda_set_device(gpu_index);
-
+  PANIC_IF_FALSE(
+      sizeof(Torus) == 8,
+      "Error: Programmable bootstrap multi-bit cg only supports 64-bit "
+      "Torus type.");
  uint64_t full_sm =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -239,6 +239,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -251,6 +252,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

+          // No need to sync, it is already synchronized before the first
+          // sample_extract_body call
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -409,7 +412,9 @@ __host__ void execute_step_one(
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
    PBS_MS_REDUCTION_T noise_reduction_type) {
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap step one only supports 64-bit "
+                 "Torus type.");
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
@@ -451,7 +456,9 @@ __host__ void execute_step_two(
    uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
    uint32_t num_many_lut, uint32_t lut_stride) {
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap step two only supports 64-bit "
+                 "Torus type.");
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  cuda_set_device(gpu_index);
  int thds = polynomial_size / params::opt;
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
@@ -44,30 +44,31 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(

  switch (polynomial_size) {
  case 256:
-    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
+    host_programmable_bootstrap_128<InputTorus, Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
-    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
+    host_programmable_bootstrap_128<InputTorus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
-    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<1024>>(
+    host_programmable_bootstrap_128<InputTorus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
-    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<2048>>(
+    host_programmable_bootstrap_128<InputTorus, Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
@@ -91,30 +92,31 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(

  switch (polynomial_size) {
  case 256:
-    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<256>>(
+    host_programmable_bootstrap_cg_128<InputTorus, Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
-    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<512>>(
+    host_programmable_bootstrap_cg_128<InputTorus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
-    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<1024>>(
+    host_programmable_bootstrap_cg_128<InputTorus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
-    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<2048>>(
+    host_programmable_bootstrap_cg_128<InputTorus, Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
@@ -240,10 +240,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
    }
  } else {
-    // Persist the updated accumulator
+    // We don't sync here because we use same indexes to read from `accumulator`
+    // as it was used in `add_to_torus_128` to write inside it Persist the
+    // updated accumulator
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      global_slice[tid] = accumulator[tid];
@@ -395,6 +398,7 @@ __global__ void device_programmable_bootstrap_cg_128(
                                               accumulator);

    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
                                               0);
    }
@@ -571,38 +575,35 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
          input_lwe_ciphertext_count, max_shared_memory)) {
    switch (polynomial_size) {
    case 256:
-      return scratch_programmable_bootstrap_cg_128<InputTorus,
-                                                   AmortizedDegree<256>>(
+      return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<256>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 512:
-      return scratch_programmable_bootstrap_cg_128<InputTorus,
-                                                   AmortizedDegree<512>>(
+      return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<512>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 1024:
-      return scratch_programmable_bootstrap_cg_128<InputTorus,
-                                                   AmortizedDegree<1024>>(
+      return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<1024>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 2048:
-      return scratch_programmable_bootstrap_cg_128<InputTorus,
-                                                   AmortizedDegree<2048>>(
+      return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<2048>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 4096:
+      // We use AmortizedDegree for 4096 to avoid register exhaustion
      return scratch_programmable_bootstrap_cg_128<InputTorus,
                                                   AmortizedDegree<4096>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
@@ -618,38 +619,35 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
  } else {
    switch (polynomial_size) {
    case 256:
-      return scratch_programmable_bootstrap_128<InputTorus,
-                                                AmortizedDegree<256>>(
+      return scratch_programmable_bootstrap_128<InputTorus, Degree<256>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 512:
-      return scratch_programmable_bootstrap_128<InputTorus,
-                                                AmortizedDegree<512>>(
+      return scratch_programmable_bootstrap_128<InputTorus, Degree<512>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 1024:
-      return scratch_programmable_bootstrap_128<InputTorus,
-                                                AmortizedDegree<1024>>(
+      return scratch_programmable_bootstrap_128<InputTorus, Degree<1024>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 2048:
-      return scratch_programmable_bootstrap_128<InputTorus,
-                                                AmortizedDegree<2048>>(
+      return scratch_programmable_bootstrap_128<InputTorus, Degree<2048>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory,
          noise_reduction_type);
      break;
    case 4096:
+      // We use AmortizedDegree for 4096 to avoid register exhaustion
      return scratch_programmable_bootstrap_128<InputTorus,
                                                AmortizedDegree<4096>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
@@ -956,22 +954,19 @@ __host__ bool supports_cooperative_groups_on_programmable_bootstrap_128(
    uint32_t max_shared_memory) {
  switch (polynomial_size) {
  case 256:
-    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
-        AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
-                              max_shared_memory);
+    return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<256>>(
+        glwe_dimension, level_count, num_samples, max_shared_memory);
  case 512:
-    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
-        AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
-                              max_shared_memory);
+    return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<512>>(
+        glwe_dimension, level_count, num_samples, max_shared_memory);
  case 1024:
-    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
-        AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
-                               max_shared_memory);
+    return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<1024>>(
+        glwe_dimension, level_count, num_samples, max_shared_memory);
  case 2048:
-    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
-        AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
-                               max_shared_memory);
+    return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<2048>>(
+        glwe_dimension, level_count, num_samples, max_shared_memory);
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    return verify_cuda_programmable_bootstrap_128_cg_grid_size<
        AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
                               max_shared_memory);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -461,6 +461,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      // No need to sync here, it is already synchronized after add_to_torus
      sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -473,6 +474,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

+          // No need to sync here, it is already synchronized after add_to_torus
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             global_slice, 0, i * lut_stride);
        }
@@ -663,7 +665,9 @@ __host__ void execute_compute_keybundle(
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
  cuda_set_device(gpu_index);
-
+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: PBS keybundle only supports 64-bit "
+                 "Torus type.");
  auto lwe_chunk_size = buffer->lwe_chunk_size;
  uint64_t chunk_size = std::min(
      lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
@@ -735,7 +739,10 @@ __host__ void execute_step_one(
    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
  cuda_set_device(gpu_index);
-
+  PANIC_IF_FALSE(
+      sizeof(Torus) == 8,
+      "Error: Programmable bootstrap multi-bit step one only supports 64-bit "
+      "Torus type.");
  uint64_t full_sm_accumulate_step_one =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
          polynomial_size);
@@ -789,7 +796,10 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
                 uint32_t level_count, uint32_t j, uint32_t num_many_lut,
                 uint32_t lut_stride) {
  cuda_set_device(gpu_index);
-
+  PANIC_IF_FALSE(
+      sizeof(Torus) == 8,
+      "Error: Programmable bootstrap multi-bit step two only supports 64-bit "
+      "Torus type.");
  uint32_t lwe_chunk_size = (uint32_t)(buffer->lwe_chunk_size);
  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
@@ -11,29 +11,30 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
  switch (polynomial_size) {
  case 256:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<256>>(
+                                                        Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 512:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<512>>(
+                                                        Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 1024:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<1024>>(
+                                                        Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 2048:
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<2048>>(
+                                                        Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
                                                        AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
@@ -55,30 +56,31 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(

  switch (polynomial_size) {
  case 256:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<256>>(
+    return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
+                                                           Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 512:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<512>>(
+    return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
+                                                           Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 1024:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<1024>>(
+    return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
+                                                           Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 2048:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<2048>>(
+    return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
+                                                           Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
        polynomial_size, level_count, input_lwe_ciphertext_count,
        allocate_gpu_memory);
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    return scratch_cg_multi_bit_programmable_bootstrap_128<
        InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
@@ -97,7 +99,7 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  bool supports_cg =
-      supports_cooperative_groups_on_multibit_programmable_bootstrap<
+      supports_cooperative_groups_on_multibit_programmable_bootstrap_128<
          __uint128_t>(glwe_dimension, polynomial_size, level_count,
                       input_lwe_ciphertext_count,
                       cuda_get_max_shared_memory(gpu_index));
@@ -129,7 +131,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(

  switch (polynomial_size) {
  case 256:
-    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
+    host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -137,7 +139,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
+    host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -145,8 +147,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_multi_bit_programmable_bootstrap_128<InputTorus,
-                                              AmortizedDegree<1024>>(
+    host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -154,8 +155,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 2048:
-    host_multi_bit_programmable_bootstrap_128<InputTorus,
-                                              AmortizedDegree<2048>>(
+    host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -163,6 +163,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    host_multi_bit_programmable_bootstrap_128<InputTorus,
                                              AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
@@ -191,8 +192,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(

  switch (polynomial_size) {
  case 256:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<256>>(
+    host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -200,8 +200,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<512>>(
+    host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -209,8 +208,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<1024>>(
+    host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -218,8 +216,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 2048:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<2048>>(
+    host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -227,6 +224,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
        num_many_lut, lut_stride);
    break;
  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
                                                 AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
@@ -333,6 +333,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<__uint128_t, params>(block_lwe_array_out,
                                               global_slice, 0);
      if (num_many_lut > 1) {
@@ -346,6 +347,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

+          // No need to sync, it is already synchronized before the first
+          // sample_extract_body call
          sample_extract_body<__uint128_t, params>(
              next_block_lwe_array_out, global_slice, 0, i * lut_stride);
        }
@@ -505,10 +508,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }

      } else if (blockIdx.y == glwe_dimension) {
-
+        __syncthreads();
        sample_extract_body<__uint128_t, params>(block_lwe_array_out,
                                                 accumulator, 0);
-
        if (num_many_lut > 1) {
          for (int i = 1; i < num_many_lut; i++) {

@@ -519,7 +521,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];
-
+            // No need to sync, it is already synchronized before the first
+            // sample_extract_body call
            sample_extract_body<__uint128_t, params>(
                next_block_lwe_array_out, accumulator, 0, i * lut_stride);
          }
@@ -1090,4 +1093,109 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap_128(
  return size_tracker;
 }

+// Verify if the grid size satisfies the cooperative group constraints
+template <typename Torus, class params>
+__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128(
+    int glwe_dimension, int level_count, int num_samples,
+    uint32_t max_shared_memory) {
+
+  // If Cooperative Groups is not supported, no need to check anything else
+  if (!cuda_check_support_cooperative_groups())
+    return false;
+
+  // Calculate the dimension of the kernel
+  uint64_t full_sm_cg_accumulate =
+      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
+          params::degree);
+  uint64_t partial_sm_cg_accumulate =
+      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
+          params::degree);
+
+  int thds = params::degree / params::opt;
+
+  // Get the maximum number of active blocks per streaming multiprocessors
+  int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
+  int max_active_blocks_per_sm;
+
+  if (max_shared_memory < partial_sm_cg_accumulate) {
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_per_sm,
+        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
+            Torus, params, NOSM>,
+        thds, 0);
+  } else if (max_shared_memory < full_sm_cg_accumulate) {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
+                                                                  PARTIALSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
+                                                                  PARTIALSM>,
+        cudaFuncCachePreferShared);
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_per_sm,
+        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
+            Torus, params, PARTIALSM>,
+        thds, partial_sm_cg_accumulate);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(cudaFuncSetAttribute(
+        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
+                                                                  FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
+    cudaFuncSetCacheConfig(
+        device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
+                                                                  FULLSM>,
+        cudaFuncCachePreferShared);
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks_per_sm,
+        (void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
+            Torus, params, FULLSM>,
+        thds, full_sm_cg_accumulate);
+    check_cuda_error(cudaGetLastError());
+  }
+
+  // Get the number of streaming multiprocessors
+  int number_of_sm = 0;
+  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
+  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
+}
+
+// Verify if the grid size for the multi-bit kernel satisfies the cooperative
+// group constraints
+template <typename Torus>
+__host__ bool
+supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
+    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
+    uint32_t max_shared_memory) {
+  switch (polynomial_size) {
+  case 256:
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, Degree<256>>(glwe_dimension, level_count, num_samples,
+                            max_shared_memory);
+  case 512:
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, Degree<512>>(glwe_dimension, level_count, num_samples,
+                            max_shared_memory);
+  case 1024:
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, Degree<1024>>(glwe_dimension, level_count, num_samples,
+                             max_shared_memory);
+  case 2048:
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, Degree<2048>>(glwe_dimension, level_count, num_samples,
+                             max_shared_memory);
+  case 4096:
+    // We use AmortizedDegree for 4096 to avoid register exhaustion
+    return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
+        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
+                                      max_shared_memory);
+  default:
+    PANIC(
+        "Cuda error (multi-bit PBS128): unsupported polynomial size. Supported "
+        "N's are powers of two"
+        " in the interval [256..4096].")
+  }
+}
+
 #endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -179,6 +179,7 @@ __global__ void device_programmable_bootstrap_tbc(
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      __syncthreads();
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);

      if (num_many_lut > 1) {
@@ -191,13 +192,17 @@ __global__ void device_programmable_bootstrap_tbc(
              &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];
-
+          // No need to sync, it is already synchronized before the first
+          // sample_extract_body call
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
      }
    }
  }
+  // Before exiting the kernel we need to sync the cluster to ensure that
+  // other blocks can still access the dsm in the mul ggsw glwe
+  cluster.sync();
 }

 template <typename Torus, class params, sharedMemDegree SMD>
@@ -207,8 +212,8 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
    const Torus *__restrict__ lut_vector_indexes,
    const Torus *__restrict__ lwe_array_in,
    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
-    uint32_t lwe_dimension, uint32_t num_many_lut, uint32_t lut_stride,
+    const double2 *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
+    uint32_t num_many_lut, uint32_t lut_stride,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  constexpr uint32_t level_count = 1;
@@ -254,9 +259,6 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
      &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
                  (glwe_dimension + 1)];

-  double2 *block_join_buffer =
-      &join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
-                   params::degree / 2];
  // Since the space is L1 cache is small, we use the same memory location for
  // the rotated accumulator and the fft accumulator, since we know that the
  // rotated array is not in use anymore by the time we perform the fft
@@ -279,13 +281,14 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
  Torus temp_a_hat = 0;
  for (int i = 0; i < lwe_dimension; i++) {

+    constexpr int WARP_SIZE = 32;
    // We calculate the modulus switch of a warp size of elements
-    if (i % 32 == 0 && (i + threadIdx.x % 32) < lwe_dimension) {
-      modulus_switch(block_lwe_array_in[i + threadIdx.x % 32], temp_a_hat,
-                     log_modulus);
+    if (i % WARP_SIZE == 0 && (i + threadIdx.x % WARP_SIZE) < lwe_dimension) {
+      modulus_switch(block_lwe_array_in[i + threadIdx.x % WARP_SIZE],
+                     temp_a_hat, log_modulus);
    }
    // each iteration we broadcast the corresponding ms previously calculated
-    Torus a_hat = __shfl_sync(0xFFFFFFFF, temp_a_hat, i % 32);
+    Torus a_hat = __shfl_sync(0xFFFFFFFF, temp_a_hat, i % WARP_SIZE);

    __syncthreads();
    Torus reg_acc_rotated[params::opt];
@@ -360,6 +363,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      // No need to sync here, it is already synchronized after add_to_torus
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);

      if (num_many_lut > 1) {
@@ -373,12 +377,16 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];

+          // No need to sync here, it is already synchronized after add_to_torus
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
      }
    }
  }
+  // Before exiting the kernel we need to sync the cluster to ensure that
+  // other blocks can still access the dsm in the mul ggsw glwe or the
+  // ping pong buffers
  cluster.sync();
 }

@@ -458,6 +466,9 @@ __host__ void host_programmable_bootstrap_tbc(
    uint32_t num_many_lut, uint32_t lut_stride) {
  cuda_set_device(gpu_index);

+  PANIC_IF_FALSE(sizeof(Torus) == 8,
+                 "Error: Programmable bootstrap tbc only supports 64-bit "
+                 "Torus type.");
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  auto supports_dsm =
      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
@@ -543,8 +554,8 @@ __host__ void host_programmable_bootstrap_tbc(
          &config,
          device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
          lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
-          lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
-          lwe_dimension, num_many_lut, lut_stride, noise_reduction_type));
+          lwe_array_in, lwe_input_indexes, bootstrapping_key, lwe_dimension,
+          num_many_lut, lut_stride, noise_reduction_type));
    } else {
      config.dynamicSmemBytes = full_sm + minimum_sm_tbc;

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -156,6 +156,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
          }
        }
      } else if (blockIdx.y == glwe_dimension) {
+        __syncthreads();
        sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
        if (num_many_lut > 1) {
          for (int i = 1; i < num_many_lut; i++) {
@@ -167,7 +168,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                        (glwe_dimension * polynomial_size + 1) +
                                    blockIdx.y * polynomial_size];
-
+            // No need to sync, it is already synchronized before the first
+            // sample_extract_body call
            sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                               accumulator, 0, i * lut_stride);
          }
@@ -179,6 +181,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
          accumulator, global_accumulator_slice);
    }
  }
+  // Before exiting the kernel we need to sync the cluster to ensure that
+  // other blocks can still access the dsm in the mul ggsw glwe
+  cluster.sync();
 }

 // Specialized version for the multi-bit bootstrap using 2_2 params:
@@ -356,6 +361,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        }
      }
    } else if (blockIdx.y == glwe_dimension) {
+      // No need to sync here, it is already synchronized after add_to_torus
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
      if (num_many_lut > 1) {
        for (int i = 1; i < num_many_lut; i++) {
@@ -367,7 +373,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
              &next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
                                      (glwe_dimension * polynomial_size + 1) +
                                  blockIdx.y * polynomial_size];
-
+          // No need to sync here, it is already synchronized after
+          // add_to_torus
          sample_extract_body<Torus, params>(next_block_lwe_array_out,
                                             accumulator, 0, i * lut_stride);
        }
@@ -379,7 +386,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        reg_acc_rotated, global_accumulator_slice);
  }
  // Before exiting the kernel we need to sync the cluster to ensure that
-  //  that other blocks can still access the dsm in the ping pong buffer
+  // other blocks can still access the dsm in the ping pong buffer
  cluster.sync();
 }

@@ -523,6 +530,11 @@ __host__ void execute_tbc_external_product_loop(
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
    uint32_t lut_stride) {
+
+  PANIC_IF_FALSE(
+      sizeof(Torus) == 8,
+      "Error: Programmable bootstrap multi-bit tbc only supports 64-bit "
+      "Torus type.");
  cuda_set_device(gpu_index);

  auto lwe_chunk_size = buffer->lwe_chunk_size;
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/dot_product.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/dot_product.cuh
@@ -3,8 +3,6 @@

 #include "crypto/torus.cuh"

-#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
-
 #define CIRCULANT_BLOCKTILE 32
 // Make a circulant matrix that serves to multiply a polynomial
 // with another one. Each thread loads a part of the original
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -339,8 +339,10 @@ template <typename Torus, class params>
 __device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,
                                    uint32_t glwe_dimension, uint32_t nth = 0) {
  // Set first coefficient of the glwe as the body of the LWE sample
-  lwe_array_out[glwe_dimension * params::degree] =
-      glwe[glwe_dimension * params::degree + nth];
+  if (threadIdx.x == 0) {
+    lwe_array_out[glwe_dimension * params::degree] =
+        glwe[glwe_dimension * params::degree + nth];
+  }
 }

 // Extracts the mask from the nth-LWE in a GLWE.
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
@@ -8,8 +8,6 @@
 #include "parameters.cuh"
 #include "types/complex/operations.cuh"

-#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
-
 template <typename T>
 __device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
  int pos = chunk_num * chunk_size;
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh
@@ -1,135 +1,34 @@
 #ifndef HELPER_CUH
 #define HELPER_CUH

-#include <cstdint>
-#include <sstream>
-#include <stdio.h>
-#include <type_traits>
+#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

-template <typename T> __device__ inline const char *get_format();
-
-template <> __device__ inline const char *get_format<int>() { return "%d, "; }
-
-template <> __device__ inline const char *get_format<unsigned int>() {
-  return "%u, ";
+inline int nextPow2(int x) {
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return ++x;
 }

-template <> __device__ inline const char *get_format<uint64_t>() {
-  return "%lu, ";
+inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
+                                   int &blocks, int &threads) {
+  threads =
+      (n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
+  blocks = CEIL_DIV(n, threads);
 }

-template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
-  for (int i = 0; i < N; i++) {
-    printf(get_format<T>(), src[i]);
-  }
+// Determines blocks and threads in x for a given blockDim.y using the same
+// logic than above
+inline void getNumBlocksAndThreads2D(const int n, const int maxBlockSize,
+                                     const int block_dim_y, int &blocks,
+                                     int &threads_x) {
+  const int max_block_dim_x = maxBlockSize / block_dim_y;
+  threads_x = (n < max_block_dim_x * 2) ? max(128, nextPow2((n + 1) / 2))
+                                        : max_block_dim_x;
+  blocks = CEIL_DIV(n, threads_x);
 }

-template <>
-__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
-  for (int i = 0; i < N; i++) {
-    uint64_t low = static_cast<uint64_t>(src[i]);
-    uint64_t high = static_cast<uint64_t>(src[i] >> 64);
-    printf("(%llu, %llu), ", high, low);
-  }
-}
-
-template <>
-__global__ inline void print_debug_kernel(const double2 *src, int N) {
-  for (int i = 0; i < N; i++) {
-    printf("(%lf, %lf), ", src[i].x, src[i].y);
-  }
-}
-template <typename T> void print_debug(const char *name, const T *src, int N) {
-  printf("%s: ", name);
-  cudaDeviceSynchronize();
-  print_debug_kernel<<<1, 1>>>(src, N);
-  cudaDeviceSynchronize();
-  printf("\n");
-}
-
-template <typename T>
-__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
-  for (int i = 0; i < N; i++) {
-    T body = src[i * (lwe_dimension + 1) + lwe_dimension];
-    T clear = body / delta;
-    printf("(%lu, %lu), ", body, clear);
-  }
-}
-
-template <typename T>
-void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
-  printf("%s: ", name);
-  cudaDeviceSynchronize();
-  print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
-  cudaDeviceSynchronize();
-  printf("\n");
-}
-
-template <typename Torus>
-void print_2d_csv_to_file(const std::vector<Torus> &v, int col_size,
-                          const char *fname) {
-  FILE *fp = fopen(fname, "wt");
-  for (int i = 0; i < v.size() / col_size; ++i) {
-    for (int j = 0; j < col_size; ++j) {
-      fprintf(fp, "%lu%c", v[i * col_size + j],
-              (j == col_size - 1) ? '\n' : ',');
-    }
-  }
-  fclose(fp);
-}
-
-template <typename Torus>
-__host__ void dump_2d_gpu_to_file(const Torus *ptr, int row_size, int col_size,
-                                  const char *fname_prefix, int rand_prefix,
-                                  cudaStream_t stream, uint32_t gpu_index) {
-  // #ifndef NDEBUG
-  std::vector<Torus> buf_cpu(row_size * col_size);
-
-  char fname[4096];
-  snprintf(fname, 4096, "%s_%d_%d_%d.csv", fname_prefix, row_size, col_size,
-           rand_prefix);
-
-  cuda_memcpy_async_to_cpu((void *)&buf_cpu[0], ptr,
-                           buf_cpu.size() * sizeof(Torus), stream, gpu_index);
-  cuda_synchronize_device(gpu_index);
-  print_2d_csv_to_file(buf_cpu, col_size, fname);
-  // #endif
-}
-
-template <typename Torus>
-__host__ void compare_2d_arrays(const Torus *ptr1, const Torus *ptr2,
-                                int row_size, int col_size, cudaStream_t stream,
-                                uint32_t gpu_index) {
-  // #ifndef NDEBUG
-  std::vector<Torus> buf_cpu1(row_size * col_size),
-      buf_cpu2(row_size * col_size);
-  ;
-  cuda_memcpy_async_to_cpu((void *)&buf_cpu1[0], ptr1,
-                           buf_cpu1.size() * sizeof(Torus), stream, gpu_index);
-  cuda_memcpy_async_to_cpu((void *)&buf_cpu2[0], ptr2,
-                           buf_cpu2.size() * sizeof(Torus), stream, gpu_index);
-  cuda_synchronize_device(gpu_index);
-
-  std::vector<uint32_t> non_matching_indexes;
-  for (int i = 0; i < buf_cpu1.size(); ++i) {
-    if (buf_cpu1[i] != buf_cpu2[i]) {
-      non_matching_indexes.push_back(i);
-    }
-  }
-
-  if (!non_matching_indexes.empty()) {
-    std::stringstream ss;
-    for (int i = 0; i < std::min(non_matching_indexes.size(), (size_t)10);
-         ++i) {
-      ss << "    difference at " << non_matching_indexes[i] << ": "
-         << buf_cpu1[non_matching_indexes[i]] << " vs "
-         << buf_cpu2[non_matching_indexes[i]] << " at index "
-         << non_matching_indexes[i] << "\n";
-    }
-    GPU_ASSERT(non_matching_indexes.empty(),
-               "Correctness error for matrices %d x %d: \n%s", row_size,
-               col_size, ss.str().c_str());
-  }
-}
-
-#endif
+#endif // KERNEL_DIMENSIONS_H
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_debug.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_debug.cuh
@@ -1,6 +1,12 @@
+#ifndef HELPER_DEBUG_CUH
+#define HELPER_DEBUG_CUH
+
 #include "cuComplex.h"
 #include "thrust/complex.h"
+#include <cstdint>
 #include <iostream>
+#include <sstream>
+#include <stdio.h>
 #include <string>
 #include <type_traits>

@@ -98,3 +104,130 @@ __device__
  }
  __syncthreads();
 }
+
+template <typename T> __device__ inline const char *get_format();
+
+template <> __device__ inline const char *get_format<int>() { return "%d, "; }
+
+template <> __device__ inline const char *get_format<unsigned int>() {
+  return "%u, ";
+}
+
+template <> __device__ inline const char *get_format<uint64_t>() {
+  return "%lu, ";
+}
+
+template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
+  for (int i = 0; i < N; i++) {
+    printf(get_format<T>(), src[i]);
+  }
+}
+
+template <>
+__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
+  for (int i = 0; i < N; i++) {
+    uint64_t low = static_cast<uint64_t>(src[i]);
+    uint64_t high = static_cast<uint64_t>(src[i] >> 64);
+    printf("(%llu, %llu), ", high, low);
+  }
+}
+
+template <>
+__global__ inline void print_debug_kernel(const double2 *src, int N) {
+  for (int i = 0; i < N; i++) {
+    printf("(%lf, %lf), ", src[i].x, src[i].y);
+  }
+}
+template <typename T> void print_debug(const char *name, const T *src, int N) {
+  printf("%s: ", name);
+  cudaDeviceSynchronize();
+  print_debug_kernel<<<1, 1>>>(src, N);
+  cudaDeviceSynchronize();
+  printf("\n");
+}
+
+template <typename T>
+__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
+  for (int i = 0; i < N; i++) {
+    T body = src[i * (lwe_dimension + 1) + lwe_dimension];
+    T clear = body / delta;
+    printf("(%lu, %lu), ", body, clear);
+  }
+}
+
+template <typename T>
+void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
+  printf("%s: ", name);
+  cudaDeviceSynchronize();
+  print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
+  cudaDeviceSynchronize();
+  printf("\n");
+}
+
+template <typename Torus>
+void print_2d_csv_to_file(const std::vector<Torus> &v, int col_size,
+                          const char *fname) {
+  FILE *fp = fopen(fname, "wt");
+  for (int i = 0; i < v.size() / col_size; ++i) {
+    for (int j = 0; j < col_size; ++j) {
+      fprintf(fp, "%lu%c", v[i * col_size + j],
+              (j == col_size - 1) ? '\n' : ',');
+    }
+  }
+  fclose(fp);
+}
+
+template <typename Torus>
+__host__ void dump_2d_gpu_to_file(const Torus *ptr, int row_size, int col_size,
+                                  const char *fname_prefix, int rand_prefix,
+                                  cudaStream_t stream, uint32_t gpu_index) {
+  // #ifndef NDEBUG
+  std::vector<Torus> buf_cpu(row_size * col_size);
+
+  char fname[4096];
+  snprintf(fname, 4096, "%s_%d_%d_%d.csv", fname_prefix, row_size, col_size,
+           rand_prefix);
+
+  cuda_memcpy_async_to_cpu((void *)&buf_cpu[0], ptr,
+                           buf_cpu.size() * sizeof(Torus), stream, gpu_index);
+  cuda_synchronize_device(gpu_index);
+  print_2d_csv_to_file(buf_cpu, col_size, fname);
+  // #endif
+}
+
+template <typename Torus>
+__host__ void compare_2d_arrays(const Torus *ptr1, const Torus *ptr2,
+                                int row_size, int col_size, cudaStream_t stream,
+                                uint32_t gpu_index) {
+  // #ifndef NDEBUG
+  std::vector<Torus> buf_cpu1(row_size * col_size),
+      buf_cpu2(row_size * col_size);
+  ;
+  cuda_memcpy_async_to_cpu((void *)&buf_cpu1[0], ptr1,
+                           buf_cpu1.size() * sizeof(Torus), stream, gpu_index);
+  cuda_memcpy_async_to_cpu((void *)&buf_cpu2[0], ptr2,
+                           buf_cpu2.size() * sizeof(Torus), stream, gpu_index);
+  cuda_synchronize_device(gpu_index);
+
+  std::vector<uint32_t> non_matching_indexes;
+  for (int i = 0; i < buf_cpu1.size(); ++i) {
+    if (buf_cpu1[i] != buf_cpu2[i]) {
+      non_matching_indexes.push_back(i);
+    }
+  }
+
+  if (!non_matching_indexes.empty()) {
+    std::stringstream ss;
+    for (int i = 0; i < std::min(non_matching_indexes.size(), (size_t)10);
+         ++i) {
+      ss << "    difference at " << non_matching_indexes[i] << ": "
+         << buf_cpu1[non_matching_indexes[i]] << " vs "
+         << buf_cpu2[non_matching_indexes[i]] << " at index "
+         << non_matching_indexes[i] << "\n";
+    }
+    GPU_ASSERT(non_matching_indexes.empty(),
+               "Correctness error for matrices %d x %d: \n%s", row_size,
+               col_size, ss.str().c_str());
+  }
+}
+#endif
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
@@ -1,4 +1,5 @@
 #include "device.h"
+#include "helper.cuh"
 #include "helper_multi_gpu.cuh"
 #include <mutex>
 #include <omp.h>
@@ -7,38 +8,7 @@ std::mutex m;
 bool p2p_enabled = false;
 const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS = 12;
 const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS = 68;
-
-// Enable bidirectional p2p access between all available GPUs and device_0_id
-int32_t cuda_setup_multi_gpu(int device_0_id) {
-  int num_gpus = cuda_get_number_of_gpus();
-  if (num_gpus == 0)
-    PANIC("GPU error: the number of GPUs should be > 0.")
-  int num_used_gpus = 1;
-  if (num_gpus > 1) {
-    m.lock();
-    if (!p2p_enabled) {
-      p2p_enabled = true;
-      omp_set_nested(1);
-      int has_peer_access_to_device_0;
-      for (int i = 1; i < num_gpus; i++) {
-        check_cuda_error(cudaDeviceCanAccessPeer(&has_peer_access_to_device_0,
-                                                 i, device_0_id));
-        if (has_peer_access_to_device_0) {
-          cuda_set_device(i);
-          check_cuda_error(cudaDeviceEnablePeerAccess(device_0_id, 0));
-          cuda_set_device(device_0_id);
-          check_cuda_error(cudaDeviceEnablePeerAccess(i, 0));
-        }
-        num_used_gpus += 1;
-      }
-    } else {
-      for (int i = 1; i < num_gpus; i++)
-        num_used_gpus += 1;
-    }
-    m.unlock();
-  }
-  return (int32_t)(num_used_gpus);
-}
+const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128 = 12;

 uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                              PBS_TYPE pbs_type) {
@@ -46,7 +16,21 @@ uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
  uint32_t ceil_div_inputs =
-      std::max((uint32_t)1, (num_inputs + threshold - 1) / threshold);
+      std::max((uint32_t)1, CEIL_DIV(num_inputs, (uint32_t)threshold));
+  uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
+  return active_gpu_count;
+}
+
+// For pbs 128 we need to use the smaller threshold in both multi bit and
+// classical
+uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
+                                   PBS_TYPE pbs_type) {
+  int threshold = (pbs_type == MULTI_BIT)
+                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
+
+  uint32_t ceil_div_inputs =
+      std::max((uint32_t)1, CEIL_DIV(num_inputs, (uint32_t)threshold));
  uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
  return active_gpu_count;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -30,6 +30,9 @@ void multi_gpu_copy_array_async(CudaStreams streams,
                                const std::vector<Torus *> &dest,
                                Torus const *src, uint32_t elements_per_gpu,
                                bool gpu_memory_allocated) {
+  PANIC_IF_FALSE(
+      dest.size() >= streams.count(),
+      "Cuda error: destination vector was not allocated for enough GPUs");
  for (uint i = 0; i < streams.count(); i++) {
    cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
        dest[i], src, elements_per_gpu * sizeof(Torus), streams.stream(i),
@@ -62,10 +65,12 @@ void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
                               PBS_TYPE pbs_type, bool allocate_gpu_memory) {
  PANIC_IF_FALSE(dest.empty(),
                 "Cuda error: Requested multi-GPU vector is already allocated");
-
+  int classical_threshold = sizeof(Torus) == 16
+                                ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+                                : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
  int threshold = (pbs_type == MULTI_BIT)
                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                      : classical_threshold;

  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
@@ -100,10 +105,12 @@ void multi_gpu_alloc_lwe_many_lut_output_async(

  PANIC_IF_FALSE(dest.empty(),
                 "Cuda error: Requested multi-GPU vector is already allocated");
-
+  int classical_threshold = sizeof(Torus) == 16
+                                ? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
+                                : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
  int threshold = (pbs_type == MULTI_BIT)
                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
-                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+                      : classical_threshold;

  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
@@ -158,13 +165,11 @@ __global__ void realign_with_indexes(Torus *d_vector,
 /// The output indexing is always the trivial one
 /// num_inputs: total num of lwe in src
 template <typename Torus>
-void multi_gpu_scatter_lwe_async(CudaStreams streams,
-                                 const std::vector<Torus *> &dest,
-                                 Torus const *src, Torus const *d_src_indexes,
-                                 bool is_trivial_index,
-                                 std::vector<Torus *> &aligned_vec,
-                                 uint32_t max_active_gpu_count,
-                                 uint32_t num_inputs, uint32_t lwe_size) {
+void multi_gpu_scatter_lwe_async(
+    CudaStreams streams, const std::vector<Torus *> &dest, Torus const *src,
+    Torus const *d_src_indexes, bool is_trivial_index,
+    std::vector<Torus *> &aligned_vec, CudaEventPool &event_pool,
+    uint32_t max_active_gpu_count, uint32_t num_inputs, uint32_t lwe_size) {

  PANIC_IF_FALSE(
      max_active_gpu_count >= streams.count(),
@@ -187,13 +192,16 @@ void multi_gpu_scatter_lwe_async(CudaStreams streams,
          streams.stream(i), streams.gpu_index(i), true);

    } else {
-      if (aligned_vec.size() == 0)
-        PANIC("Cuda error: auxiliary arrays should be setup!");
+      PANIC_IF_FALSE(aligned_vec.size() > 0,
+                     "Cuda error: auxiliary arrays should be setup!");
+      PANIC_IF_FALSE(
+          aligned_vec.size() >= streams.count(),
+          "Cuda error: aligned vec was not allocated for enough GPUs");

      if (d_src_indexes == nullptr)
        PANIC("Cuda error: source indexes should be initialized!");

-      cudaEvent_t temp_event2 = cuda_create_event(streams.gpu_index(0));
+      cudaEvent_t temp_event2 = event_pool.request_event(streams.gpu_index(0));
      cuda_set_device(streams.gpu_index(0));
      align_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams.stream(0)>>>(
          aligned_vec[i], (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
@@ -207,7 +215,7 @@ void multi_gpu_scatter_lwe_async(CudaStreams streams,
          dest[i], aligned_vec[i], inputs_on_gpu * lwe_size * sizeof(Torus),
          streams.stream(i), streams.gpu_index(i), true);

-      cudaEvent_t temp_event = cuda_create_event(streams.gpu_index(i));
+      cudaEvent_t temp_event = event_pool.request_event(streams.gpu_index(i));
      cuda_event_record(temp_event, streams.stream(i), streams.gpu_index(i));
      cuda_stream_wait_event(streams.stream(0), temp_event,
                             streams.gpu_index(0));
@@ -223,7 +231,8 @@ void multi_gpu_gather_lwe_async(CudaStreams streams, Torus *dest,
                                const std::vector<Torus *> &src,
                                Torus *d_dest_indexes, bool is_trivial_index,
                                std::vector<Torus *> &aligned_vec,
-                                uint32_t num_inputs, uint32_t lwe_size) {
+                                CudaEventPool &event_pool, uint32_t num_inputs,
+                                uint32_t lwe_size) {

  PANIC_IF_FALSE(src.size() >= streams.count(),
                 "Cuda error: src vector was not allocated for enough GPUs");
@@ -242,12 +251,15 @@ void multi_gpu_gather_lwe_async(CudaStreams streams, Torus *dest,
          d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
          streams.stream(i), streams.gpu_index(i), true);
    } else {
-      if (aligned_vec.size() == 0)
-        PANIC("Cuda error: auxiliary arrays should be setup!");
+      PANIC_IF_FALSE(aligned_vec.size() > 0,
+                     "Cuda error: auxiliary arrays should be setup!");
+      PANIC_IF_FALSE(
+          aligned_vec.size() >= streams.count(),
+          "Cuda error: aligned vec was not allocated for enough GPUs");
      if (d_dest_indexes == nullptr)
        PANIC("Cuda error: destination indexes should be initialized!");

-      cudaEvent_t temp_event2 = cuda_create_event(streams.gpu_index(0));
+      cudaEvent_t temp_event2 = event_pool.request_event(streams.gpu_index(0));

      cuda_event_record(temp_event2, streams.stream(0), streams.gpu_index(0));
      cuda_stream_wait_event(streams.stream(i), temp_event2,
@@ -257,7 +269,7 @@ void multi_gpu_gather_lwe_async(CudaStreams streams, Torus *dest,
          aligned_vec[i], src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
          streams.stream(i), streams.gpu_index(i), true);

-      cudaEvent_t temp_event3 = cuda_create_event(streams.gpu_index(i));
+      cudaEvent_t temp_event3 = event_pool.request_event(streams.gpu_index(i));
      cuda_event_record(temp_event3, streams.stream(i), streams.gpu_index(i));
      cuda_stream_wait_event(streams.stream(0), temp_event3,
                             streams.gpu_index(0));
@@ -302,6 +314,8 @@ void multi_gpu_gather_many_lut_lwe_async(CudaStreams streams, Torus *dest,
            d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
            streams.stream(i), streams.gpu_index(i), true);
      } else {
+        if (h_dest_indexes == nullptr)
+          PANIC("Cuda error: destination indexes should be initialized!");
        auto dest_indexes = h_dest_indexes + gpu_offset;

        for (uint j = 0; j < inputs_on_gpu; j++) {
@@ -322,6 +336,8 @@ void multi_gpu_gather_many_lut_lwe_async(CudaStreams streams, Torus *dest,
 template <typename Torus>
 void multi_gpu_release_async(CudaStreams streams, std::vector<Torus *> &vec) {

+  PANIC_IF_FALSE(vec.size() >= streams.count(),
+                 "Cuda error: vec was not allocated for enough GPUs");
  for (uint i = 0; i < vec.size(); i++)
    cuda_drop_async(vec[i], streams.stream(i), streams.gpu_index(i));
 }
--- a/backends/tfhe-cuda-backend/cuda/src/utils/kernel_dimensions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/kernel_dimensions.cuh
@@ -1,32 +0,0 @@
-#ifndef KERNEL_DIMENSIONS_CUH
-#define KERNEL_DIMENSIONS_CUH
-
-inline int nextPow2(int x) {
-  --x;
-  x |= x >> 1;
-  x |= x >> 2;
-  x |= x >> 4;
-  x |= x >> 8;
-  x |= x >> 16;
-  return ++x;
-}
-
-inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
-                                   int &blocks, int &threads) {
-  threads =
-      (n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
-  blocks = (n + threads - 1) / threads;
-}
-
-// Determines blocks and threads in x for a given blockDim.y using the same
-// logic than above
-inline void getNumBlocksAndThreads2D(const int n, const int maxBlockSize,
-                                     const int block_dim_y, int &blocks,
-                                     int &threads_x) {
-  const int max_block_dim_x = maxBlockSize / block_dim_y;
-  threads_x = (n < max_block_dim_x * 2) ? max(128, nextPow2((n + 1) / 2))
-                                        : max_block_dim_x;
-  blocks = (n + threads_x - 1) / threads_x;
-}
-
-#endif // KERNEL_DIMENSIONS_H
--- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cu
@@ -8,10 +8,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
    uint32_t casting_output_dimension, uint32_t casting_ks_level,
    uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
-    const bool *is_boolean_array, uint32_t num_compact_lists,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    KS_TYPE casting_key_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
+    const bool *is_boolean_array, const uint32_t is_boolean_array_len,
+    uint32_t num_compact_lists, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
+    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

  // Since CUDA backend works with the concept of "big" and "small" key, instead
  // of "input" and "output", we need to do this or otherwise our PBS will throw
@@ -36,8 +36,9 @@ uint64_t scratch_cuda_expand_without_verification_64(
  return scratch_cuda_expand_without_verification<uint64_t>(
      CudaStreams(streams),
      reinterpret_cast<zk_expand_mem<uint64_t> **>(mem_ptr),
-      num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
-      computing_params, casting_params, casting_key_type, allocate_gpu_memory);
+      num_lwes_per_compact_list, is_boolean_array, is_boolean_array_len,
+      num_compact_lists, computing_params, casting_params, casting_key_type,
+      allocate_gpu_memory);
 }

 void cuda_expand_without_verification_64(
--- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
@@ -11,10 +11,73 @@
 #include "polynomial/functions.cuh"
 #include "utils/helper.cuh"
 #include "utils/helper_multi_gpu.cuh"
-#include "utils/kernel_dimensions.cuh"
 #include "zk/zk_utilities.h"
 #include <functional>

+/*
+ * =============================================================================
+ * GPU Expand Algorithm: Overview
+ * =============================================================================
+ *
+ * The expand algorithm transforms compact ciphertexts into standard LWE
+ * ciphertexts. Compact ciphertexts save space by sharing a single mask across
+ * multiple encrypted messages.
+ *
+ * -----------------------------------------------------------------------------
+ * INPUT STRUCTURE (lwe_flattened_compact_array_in)
+ * -----------------------------------------------------------------------------
+ *
+ * A contiguous array of concatenated compact ciphertext lists:
+ *
+ *  ┌─────────────────────────────────────────────────────────────────────────┐
+ *  │            lwe_flattened_compact_array_in (GPU memory)                  │
+ *  └─────────────────────────────────────────────────────────────────────────┘
+ *  ┌────────────────────────────────┬────────────────────────────────────────┐
+ *  │      Compact List 0            │           Compact List 1 │...
+ *  └────────────────────────────────┴────────────────────────────────────────┘
+ *
+ *  Each compact list structure:
+ *  ┌─────────────────────────────────────────────┬────────┬────────┬─────────┐
+ *  │  Shared Mask (lwe_dimension coefficients)   │ Body 0 │ Body 1 │ Body 2
+ * │... │  [a_0, a_1, ..., a_{n-1}]                   │  b_0   │  b_1   │  b_2 │
+ *  └─────────────────────────────────────────────┴────────┴────────┴─────────┘
+ *  │<────────── lwe_dimension ──────────────────>│<── num_lwes_in_list ─────>│
+ *
+ * -----------------------------------------------------------------------------
+ * EXPAND PROCESS
+ * -----------------------------------------------------------------------------
+ *
+ * 1. LWE Expansion (lwe_expand kernel):
+ *    Each (mask, body_i) pair becomes a standard LWE by rotating the mask
+ *    by i positions: LWE_i = (rotate(mask, i), body_i)
+ *
+ * 2. Message/Carry Extraction (PBS with LUTs):
+ *    Each expanded LWE contains packed data. PBS extracts both parts:
+ *
+ *    Input LWE_i ──PBS──> Output[2i]   (message extraction LUT)
+ *                   └───> Output[2i+1] (carry extraction LUT)
+ *
+ *    For boolean values, sanitization LUTs clamp output to {0, 1}.
+ *
+ * -----------------------------------------------------------------------------
+ * OUTPUT STRUCTURE (lwe_array_out)
+ * -----------------------------------------------------------------------------
+ *
+ *  ┌─────────────────────────────────────────────────────────────────────────┐
+ *  │             lwe_array_out (2 * num_lwes standard LWEs)                  │
+ *  └─────────────────────────────────────────────────────────────────────────┘
+ *  ┌──────────────┬──────────────┬──────────────┬──────────────┬─────────────┐
+ *  │  LWE 0 (msg) │ LWE 0 (carry)│  LWE 1 (msg) │ LWE 1 (carry)│    ...      │
+ *  └──────────────┴──────────────┴──────────────┴──────────────┴─────────────┘
+ *
+ *  Each output LWE: [mask (lwe_dimension), body (1)] = lwe_dimension + 1
+ * elements
+ *
+ * See zk_utilities.h for detailed documentation on the is_boolean array and
+ * LUT indexing logic.
+ * =============================================================================
+ */
+
 template <typename Torus, class params>
 __host__ void host_expand_without_verification(
    CudaStreams streams, Torus *lwe_array_out,
@@ -107,15 +170,15 @@ template <typename Torus>
 __host__ uint64_t scratch_cuda_expand_without_verification(
    CudaStreams streams, zk_expand_mem<Torus> **mem_ptr,
    const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
-    uint32_t num_compact_lists, int_radix_params computing_params,
-    int_radix_params casting_params, KS_TYPE casting_key_type,
-    bool allocate_gpu_memory) {
+    const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
+    int_radix_params computing_params, int_radix_params casting_params,
+    KS_TYPE casting_key_type, bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new zk_expand_mem<Torus>(
      streams, computing_params, casting_params, casting_key_type,
-      num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
-      allocate_gpu_memory, size_tracker);
+      num_lwes_per_compact_list, is_boolean_array, is_boolean_array_len,
+      num_compact_lists, allocate_gpu_memory, size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_keyswitch.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_keyswitch.cpp
@@ -67,7 +67,7 @@ public:
    number_of_inputs = (int)GetParam().number_of_inputs;

    // Enable Multi-GPU logic
-    gpu_count = cuda_setup_multi_gpu(0);
+    gpu_count = cuda_get_number_of_gpus();
    active_gpu_count = std::min((uint)number_of_inputs, gpu_count);
    for (uint gpu_i = 0; gpu_i < active_gpu_count; gpu_i++) {
      streams.push_back(cuda_create_stream(gpu_i));
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -2533,6 +2533,7 @@ unsafe extern "C" {
        grouping_factor: u32,
        num_lwes_per_compact_list: *const u32,
        is_boolean_array: *const bool,
+        is_boolean_array_len: u32,
        num_compact_lists: u32,
        message_modulus: u32,
        carry_modulus: u32,
@@ -2711,7 +2712,7 @@ unsafe extern "C" {
    );
 }
 unsafe extern "C" {
-    pub fn cuda_closest_representable_64(
+    pub fn cuda_closest_representable_64_async(
        stream: *mut ffi::c_void,
        gpu_index: u32,
        input: *const ffi::c_void,
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -13,27 +13,10 @@ extern "C" {

    pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void;

-    pub fn cuda_malloc_with_size_tracking_async(
-        size: u64,
-        stream: *mut c_void,
-        gpu_index: u32,
-        size_tracker: *mut u64,
-        allocate_gpu_memory: bool,
-    ) -> *mut c_void;
-
    pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
    pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
    pub fn cuda_device_total_memory(gpu_index: u32) -> u64;

-    pub fn cuda_memcpy_with_size_tracking_async_to_gpu(
-        dest: *mut c_void,
-        src: *const c_void,
-        size: u64,
-        stream: *mut c_void,
-        gpu_index: u32,
-        gpu_memory_allocated: bool,
-    );
-
    pub fn cuda_memcpy_async_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
@@ -44,15 +27,6 @@ extern "C" {

    pub fn cuda_memcpy_gpu_to_gpu(dest: *mut c_void, src: *const c_void, size: u64, gpu_index: u32);

-    pub fn cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
-        dest: *mut c_void,
-        src: *const c_void,
-        size: u64,
-        stream: *mut c_void,
-        gpu_index: u32,
-        gpu_memory_allocated: bool,
-    );
-
    pub fn cuda_memcpy_async_gpu_to_gpu(
        dest: *mut c_void,
        src: *const c_void,
@@ -69,15 +43,6 @@ extern "C" {
        gpu_index: u32,
    );

-    pub fn cuda_memset_with_size_tracking_async(
-        dest: *mut c_void,
-        val: u64,
-        size: u64,
-        stream: *mut c_void,
-        gpu_index: u32,
-        gpu_memory_allocated: bool,
-    );
-
    pub fn cuda_memset_async(
        dest: *mut c_void,
        val: u64,
@@ -94,16 +59,4 @@ extern "C" {

    pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);

-    pub fn cuda_drop_with_size_tracking_async(
-        ptr: *mut c_void,
-        stream: *mut c_void,
-        gpu_index: u32,
-        size_tracker: *mut u64,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
-
-    pub fn cuda_setup_multi_gpu(gpu_index: u32) -> i32;
-
 } // extern "C"
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -100,3 +100,9 @@ environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
 flavor_name = "n3-L40x4"
 user = "ubuntu"
+
+[backend.hyperstack.4-l40_fallback]
+environment_name = "norway"
+image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
+flavor_name = "n3-RTX-A4000x4"
+user = "ubuntu"
--- a/scripts/check_memory_errors.sh
+++ b/scripts/check_memory_errors.sh
@@ -28,6 +28,9 @@ if [[ "${RUN_VALGRIND}" == "0" && "${RUN_COMPUTE_SANITIZER}" == "0" ]]; then
  exit 1
 fi

+# Array to collect error messages for final summary
+ERROR_MESSAGES=()
+
 # List the tests into a temporary file
 RUSTFLAGS="$RUSTFLAGS" cargo nextest list --cargo-profile "${CARGO_PROFILE}" \
          --features=integer,internal-keycache,gpu-debug,zk-pok -p tfhe &> /tmp/test_list.txt
@@ -46,14 +49,28 @@ if [[ "${RUN_VALGRIND}" == "1" ]]; then
  # Find the test executable -> last one to have been modified
  EXECUTABLE=target/release/deps/$(find target/release/deps/ -type f -executable -name "tfhe-*" -printf "%T@ %f\n" |sort -nr|sed 's/^.* //; q;')

-  # shellcheck disable=SC2181
  RESULT=0
  while read -r t; do
        [ -z "$t" ] && continue
        echo "Running valgrind on: $t"
+
        VALGRIND_EXIT=0
-        valgrind --leak-check=full --show-leak-kinds=definite "$EXECUTABLE" -- "$t" || VALGRIND_EXIT=$?
+        valgrind --leak-check=full \
+            --show-leak-kinds=definite,indirect \
+            --errors-for-leak-kinds=definite,indirect \
+            --error-exitcode=1 \
+            "$EXECUTABLE" -- "$t" 2>&1 | tee /tmp/valgrind_output.log || VALGRIND_EXIT=$?
+
+        # Fail if the test crashed (non-zero exit code from valgrind)
        if [[ $VALGRIND_EXIT -ne 0 ]]; then
+            ERROR_MESSAGES+=("Test crashed or valgrind returned error for test: $t")
+            RESULT=1
+        fi
+
+        # Also fail if memory errors reference tfhe/cuda code (not system libraries)
+        if grep -E "definitely lost|indirectly lost|Invalid read|Invalid write|Invalid free|Mismatched free" /tmp/valgrind_output.log | \
+           grep -q "tfhe\|cuda"; then
+            ERROR_MESSAGES+=("Memory error detected in tfhe/cuda code for test: $t")
            RESULT=1
        fi
  done <<< "$TESTS_TO_RUN"
@@ -81,9 +98,22 @@ if [[ "${RUN_COMPUTE_SANITIZER}" == "1" ]]; then
            --error-exitcode=1 --target-processes=all \
            "$EXECUTABLE" -- "$t" || CS_EXIT=$?
        if [[ $CS_EXIT -ne 0 ]]; then
+            ERROR_MESSAGES+=("Compute-sanitizer detected error for test: $t")
            RESULT=1
        fi
    done <<< "$TESTS_TO_RUN"
 fi

+# Print summary of errors if any were encountered
+if [[ ${#ERROR_MESSAGES[@]} -gt 0 ]]; then
+    echo ""
+    echo "========================================"
+    echo "MEMORY ERROR SUMMARY"
+    echo "========================================"
+    for msg in "${ERROR_MESSAGES[@]}"; do
+        echo "  - $msg"
+    done
+    echo "========================================"
+fi
+
 exit $RESULT
--- a/tests/backward_compatibility/high_level_api.rs
+++ b/tests/backward_compatibility/high_level_api.rs
@@ -3,9 +3,7 @@ use crate::{load_and_unversionize, TestedModule};
 use std::path::Path;
 #[cfg(feature = "zk-pok")]
 use tfhe::integer::parameters::DynamicDistribution;
-use tfhe::prelude::{
-    CiphertextList, FheDecrypt, FheEncrypt, ParameterSetConformant, ReRandomize, SquashNoise,
-};
+use tfhe::prelude::*;
 #[cfg(feature = "zk-pok")]
 use tfhe::shortint::parameters::{
    CompactCiphertextListExpansionKind, CompactPublicKeyEncryptionParameters,
@@ -13,27 +11,29 @@ use tfhe::shortint::parameters::{
 #[cfg(feature = "zk-pok")]
 use tfhe::shortint::prelude::LweDimension;
 use tfhe::shortint::{CarryModulus, CiphertextModulus, MessageModulus};
+use tfhe::xof_key_set::CompressedXofKeySet;
 #[cfg(feature = "zk-pok")]
 use tfhe::zk::{CompactPkeCrs, CompactPkeCrsConformanceParams};
+#[cfg(feature = "zk-pok")]
+use tfhe::ProvenCompactCiphertextList;
 use tfhe::{
-    set_server_key, ClientKey, CompactCiphertextList, CompressedCiphertextList,
+    set_server_key, ClientKey, CompactCiphertextList, CompactCiphertextListBuilder,
+    CompactPublicKey, CompressedCiphertextList, CompressedCiphertextListBuilder,
    CompressedCompactPublicKey, CompressedFheBool, CompressedFheInt8, CompressedFheUint8,
    CompressedKVStore, CompressedPublicKey, CompressedServerKey,
-    CompressedSquashedNoiseCiphertextList, FheBool, FheInt8, FheUint64, FheUint8,
-    ReRandomizationContext, ServerKey, SquashedNoiseFheBool, SquashedNoiseFheInt,
-    SquashedNoiseFheUint,
+    CompressedSquashedNoiseCiphertextList, CompressedSquashedNoiseCiphertextListBuilder, FheBool,
+    FheInt8, FheUint32, FheUint64, FheUint8, ReRandomizationContext, ServerKey,
+    SquashedNoiseFheBool, SquashedNoiseFheInt, SquashedNoiseFheUint,
 };
-#[cfg(feature = "zk-pok")]
-use tfhe::{CompactPublicKey, ProvenCompactCiphertextList};
 use tfhe_backward_compat_data::load::{
    load_versioned_auxiliary, DataFormat, TestFailure, TestResult, TestSuccess,
 };
 use tfhe_backward_compat_data::{
    DataKind, HlBoolCiphertextTest, HlCiphertextTest, HlClientKeyTest, HlCompressedKVStoreTest,
-    HlCompressedSquashedNoiseCiphertextListTest, HlHeterogeneousCiphertextListTest,
-    HlPublicKeyTest, HlServerKeyTest, HlSignedCiphertextTest, HlSquashedNoiseBoolCiphertextTest,
-    HlSquashedNoiseSignedCiphertextTest, HlSquashedNoiseUnsignedCiphertextTest, TestMetadata,
-    TestType, Testcase, ZkPkePublicParamsTest,
+    HlCompressedSquashedNoiseCiphertextListTest, HlCompressedXofKeySetTest,
+    HlHeterogeneousCiphertextListTest, HlPublicKeyTest, HlServerKeyTest, HlSignedCiphertextTest,
+    HlSquashedNoiseBoolCiphertextTest, HlSquashedNoiseSignedCiphertextTest,
+    HlSquashedNoiseUnsignedCiphertextTest, TestMetadata, TestType, Testcase, ZkPkePublicParamsTest,
 };
 use tfhe_versionable::Unversionize;

@@ -360,6 +360,155 @@ pub fn test_hl_pubkey(
    }
 }

+/// Shared feature-testing logic for server keys: computation, re-randomization, noise squashing,
+/// compression, and compressed noise-squashed lists.
+fn test_hl_key_features(
+    client_key: &ClientKey,
+    server_key: ServerKey,
+    compact_public_key: Option<&CompactPublicKey>,
+    test: &impl TestType,
+    format: DataFormat,
+) -> Result<(), TestFailure> {
+    set_server_key(server_key.clone());
+
+    let clear_a = 278120u32;
+    let clear_b = 839412u32;
+
+    let (mut a, mut b) = match compact_public_key {
+        Some(pk) => {
+            let compact_list = CompactCiphertextListBuilder::new(pk)
+                .push(clear_a)
+                .push(clear_b)
+                .build_packed();
+
+            let expanded = compact_list
+                .expand()
+                .map_err(|e| test.failure(format!("Failed to expand: {e}"), format))?;
+            let a: FheUint32 = expanded.get(0).unwrap().unwrap();
+            let b: FheUint32 = expanded.get(1).unwrap().unwrap();
+            (a, b)
+        }
+        None => {
+            let a = FheUint32::encrypt(clear_a, client_key);
+            let b = FheUint32::encrypt(clear_b, client_key);
+            (a, b)
+        }
+    };
+
+    // Re-randomization
+    if let (Some(pk), true) = (
+        compact_public_key,
+        server_key.supports_ciphertext_re_randomization(),
+    ) {
+        let nonce: [u8; 256 / 8] = core::array::from_fn(|i| i as u8);
+        let mut re_rand_context = ReRandomizationContext::new(
+            *b"TFHE_Rrd",
+            [b"FheUint32 bin ops".as_slice(), nonce.as_slice()],
+            *b"TFHE_Enc",
+        );
+
+        re_rand_context.add_ciphertext(&a);
+        re_rand_context.add_ciphertext(&b);
+
+        let mut seed_gen = re_rand_context.finalize();
+
+        a.re_randomize(pk, seed_gen.next_seed().unwrap())
+            .map_err(|e| test.failure(format!("Failed to re-randomize a: {e}"), format))?;
+        b.re_randomize(pk, seed_gen.next_seed().unwrap())
+            .map_err(|e| test.failure(format!("Failed to re-randomize b: {e}"), format))?;
+    }
+
+    // Computation
+    let c = &a + &b;
+    let d = &a & &b;
+
+    let expected_c = clear_a.wrapping_add(clear_b);
+    let expected_d = clear_a & clear_b;
+
+    for (val, expected) in [&c, &d].iter().zip([expected_c, expected_d]) {
+        let dec: u32 = val.decrypt(client_key);
+        if dec != expected {
+            return Err(test.failure(
+                format!("Invalid decryption: expected {expected}, got {dec}"),
+                format,
+            ));
+        }
+    }
+
+    // Noise squashing
+    if server_key.supports_noise_squashing() {
+        let ns_c = c
+            .squash_noise()
+            .map_err(|e| test.failure(format!("Failed to squash noise: {e}"), format))?;
+        let ns_d = d
+            .squash_noise()
+            .map_err(|e| test.failure(format!("Failed to squash noise: {e}"), format))?;
+
+        for (ns_val, expected) in [&ns_c, &ns_d].iter().zip([expected_c, expected_d]) {
+            let dec: u32 = ns_val.decrypt(client_key);
+            if dec != expected {
+                return Err(test.failure(
+                    format!("Invalid noise-squashed decryption: expected {expected}, got {dec}"),
+                    format,
+                ));
+            }
+        }
+
+        if server_key.supports_noise_squashing_compression() {
+            // Compressed noise-squashed ciphertext list
+            let ns_compressed_list = CompressedSquashedNoiseCiphertextListBuilder::new()
+                .push(ns_c)
+                .push(ns_d)
+                .build()
+                .map_err(|e| {
+                    test.failure(
+                        format!("Failed to build compressed squashed noise list: {e}"),
+                        format,
+                    )
+                })?;
+
+            for (i, expected) in [expected_c, expected_d].iter().enumerate() {
+                let val: SquashedNoiseFheUint = ns_compressed_list.get(i).unwrap().unwrap();
+                let dec: u32 = val.decrypt(client_key);
+                if dec != *expected {
+                    return Err(test.failure(
+                        format!(
+                            "Invalid compressed noise-squashed[{i}]: \
+                             expected {expected}, got {dec}"
+                        ),
+                        format,
+                    ));
+                }
+            }
+        }
+    }
+
+    // Compression / decompression
+    if server_key.supports_compression() {
+        let compressed_list = CompressedCiphertextListBuilder::new()
+            .push(a)
+            .push(b)
+            .push(c)
+            .push(d)
+            .build()
+            .map_err(|e| test.failure(format!("Failed to build compressed list: {e}"), format))?;
+
+        let expected_values = [clear_a, clear_b, expected_c, expected_d];
+        for (i, expected) in expected_values.iter().enumerate() {
+            let val: FheUint32 = compressed_list.get(i).unwrap().unwrap();
+            let dec: u32 = val.decrypt(client_key);
+            if dec != *expected {
+                return Err(test.failure(
+                    format!("Invalid decompressed[{i}]: expected {expected}, got {dec}"),
+                    format,
+                ));
+            }
+        }
+    }
+
+    Ok(())
+}
+
 /// Test HL server key: encrypt two values with a client key, add them using the server key and
 /// check that the decrypted sum is valid.
 pub fn test_hl_serverkey(
@@ -373,11 +522,6 @@ pub fn test_hl_serverkey(
    )
    .map_err(|e| test.failure(e, format))?;

-    let v1 = 73u8;
-    let mut ct1 = FheUint8::encrypt(v1, &client_key);
-    let v2 = 102u8;
-    let ct2 = FheUint8::encrypt(v2, &client_key);
-
    let key = if test.compressed {
        let compressed: CompressedServerKey = load_and_unversionize(dir, test, format)?;
        compressed.decompress()
@@ -385,77 +529,20 @@ pub fn test_hl_serverkey(
        load_and_unversionize(dir, test, format)?
    };

-    let has_noise_squashing = key.supports_noise_squashing();
-    let has_rerand = key.supports_ciphertext_re_randomization();
-    set_server_key(key);
-
-    if has_noise_squashing {
-        let ns = ct1.squash_noise().unwrap();
-        let res: u8 = ns.decrypt(&client_key);
-        if res != v1 {
-            return Err(test.failure(
-                format!(
-                    "Invalid result for noise squashing using loaded server key, expected {v1} got {res}",
-                ),
-                format,
-            ));
-        }
-    }
-
-    if let Some(rerand_cpk_filename) = test.rerand_cpk_filename.as_ref() {
-        if has_rerand {
-            let rerand_cpk_file = dir.join(rerand_cpk_filename.to_string());
-            let public_key = CompressedCompactPublicKey::unversionize(
-                load_versioned_auxiliary(rerand_cpk_file).map_err(|e| test.failure(e, format))?,
+    let compact_public_key = test
+        .rerand_cpk_filename
+        .as_ref()
+        .map(|filename| {
+            let cpk_file = dir.join(filename.to_string());
+            CompressedCompactPublicKey::unversionize(
+                load_versioned_auxiliary(cpk_file).map_err(|e| test.failure(e, format))?,
            )
-            .map_err(|e| test.failure(e, format))?
-            .decompress();
+            .map_err(|e| test.failure(e, format))
+            .map(|cpk| cpk.decompress())
+        })
+        .transpose()?;

-            let nonce: [u8; 256 / 8] = rand::random();
-            let mut re_rand_context = ReRandomizationContext::new(
-                *b"TFHE_Rrd",
-                [b"FheUint8".as_slice(), nonce.as_slice()],
-                *b"TFHE_Enc",
-            );
-
-            re_rand_context.add_ciphertext(&ct1);
-            let mut seed_gen = re_rand_context.finalize();
-
-            ct1.re_randomize(&public_key, seed_gen.next_seed().unwrap())
-                .unwrap();
-
-            #[allow(clippy::eq_op)]
-            let rrd = &ct1 & &ct1;
-            let res: u8 = rrd.decrypt(&client_key);
-            if res != v1 {
-                return Err(test.failure(
-                    format!(
-                    "Invalid result for rerand using loaded server key, expected {v1} got {res}",
-                ),
-                    format,
-                ));
-            }
-        } else {
-            return Err(test.failure(
-                "Test requires rerand key but server key does not have it".to_string(),
-                format,
-            ));
-        }
-    }
-
-    let ct_sum = ct1 + ct2;
-    let sum: u8 = ct_sum.decrypt(&client_key);
-
-    if sum != v1 + v2 {
-        return Err(test.failure(
-            format!(
-                "Invalid result for addition using loaded server key, expected {} got {}",
-                v1 + v2,
-                sum,
-            ),
-            format,
-        ));
-    }
+    test_hl_key_features(&client_key, key, compact_public_key.as_ref(), test, format)?;

    Ok(test.success(format))
 }
@@ -659,6 +746,39 @@ fn test_hl_compressed_kv_store_test(
    Ok(test.success(format))
 }

+fn test_hl_compressed_xof_key_set_test(
+    dir: &Path,
+    test: &HlCompressedXofKeySetTest,
+    format: DataFormat,
+) -> Result<TestSuccess, TestFailure> {
+    let client_key_file = dir.join(&*test.client_key_file_name);
+    let client_key = ClientKey::unversionize(
+        load_versioned_auxiliary(client_key_file).map_err(|e| test.failure(e, format))?,
+    )
+    .map_err(|e| test.failure(format!("Failed to load client key file: {e}"), format))?;
+
+    let compressed_xof_key_set_file = dir.join(&*test.compressed_xof_key_set_file_name);
+    let compressed_xof_key_set = CompressedXofKeySet::unversionize(
+        load_versioned_auxiliary(compressed_xof_key_set_file)
+            .map_err(|e| test.failure(e, format))?,
+    )
+    .map_err(|e| {
+        test.failure(
+            format!("Failed to load compressed xof key set file: {e}"),
+            format,
+        )
+    })?;
+
+    let xof_key_set = compressed_xof_key_set
+        .decompress()
+        .map_err(|e| test.failure(format!("Failed to decompress the xof key set: {e}"), format))?;
+
+    let (pk, server_key) = xof_key_set.into_raw_parts();
+
+    test_hl_key_features(&client_key, server_key, Some(&pk), test, format)?;
+
+    Ok(test.success(format))
+}
 pub struct Hl;

 impl TestedModule for Hl {
@@ -711,6 +831,9 @@ impl TestedModule for Hl {
            TestMetadata::HlCompressedKVStoreTest(test) => {
                test_hl_compressed_kv_store_test(test_dir.as_ref(), test, format).into()
            }
+            TestMetadata::HlCompressedXofKeySet(test) => {
+                test_hl_compressed_xof_key_set_test(test_dir.as_ref(), test, format).into()
+            }
            _ => {
                println!("WARNING: missing test: {:?}", testcase.metadata);
                TestResult::Skipped(testcase.skip())
--- a/tfhe-benchmark/benches/integer/rerand.rs
+++ b/tfhe-benchmark/benches/integer/rerand.rs
@@ -197,10 +197,10 @@ mod cuda {
    use tfhe::integer::ciphertext::ReRandomizationContext;
    use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
    use tfhe::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
+    use tfhe::integer::gpu::key_switching_key::CudaKeySwitchingKeyMaterial;
    use tfhe::integer::key_switching_key::KeySwitchingKey;
    use tfhe::integer::{gen_keys_radix, CompactPrivateKey, CompactPublicKey};
    use tfhe::keycache::NamedParam;
-    use tfhe::shortint::key_switching_key::CudaKeySwitchingKeyMaterial;

    fn execute_gpu_re_randomize(c: &mut Criterion, bit_size: usize) {
        let bench_name = "integer::cuda::re_randomize";
--- a/tfhe-csprng/Cargo.toml
+++ b/tfhe-csprng/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-csprng"
-version = "0.8.0"
+version = "0.8.1"
 edition = "2021"
 license = "BSD-3-Clause-Clear"
 description = "Cryptographically Secure PRNG used in the TFHE-rs library."
@@ -25,6 +25,8 @@ libc = "0.2.133"
 rand = { workspace = true }
 criterion = "0.5.1"
 clap = "=4.5.30"
+ctr = "0.9.2"
+

 [features]
 parallel = ["rayon"]
--- a/tfhe-csprng/src/generators/aes_ctr/block_cipher.rs
+++ b/tfhe-csprng/src/generators/aes_ctr/block_cipher.rs
@@ -9,7 +9,7 @@ use crate::generators::aes_ctr::{AES_CALLS_PER_BATCH, BYTES_PER_AES_CALL, BYTES_
 /// needs to be loaded with [u128::from_le] (to keep consistency of the loaded bytes across systems
 /// endianness), the rest of the code should use the [`AesKey`] with native endian ordering such
 /// that the internal u128 is equivalent to [u8; 16].
-#[derive(Clone, Copy)]
+#[derive(Clone, Copy, Debug)]
 pub(crate) struct AesKey(pub(crate) u128);

 /// A trait for AES block ciphers.
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
David Testé	9d18c6c3d8	chore(ci): update slab-github-runner action to v1.6.0 This action version now uses node24 as runner since node20 support is dropped on April 2026.	2026-04-01 09:47:54 +02:00
Thomas Montaigu	638f4c8bab	chore: add compressed xof key set backward to 1.5	2026-03-11 17:26:14 +01:00
Thomas Montaigu	509afb0967	refactor: use div_ceil to compute public seed bytes Coupled with the assert this ensures at least one byte for the seed.	2026-03-11 17:26:14 +01:00
Thomas Montaigu	da3d966fba	feat: impl Tagged for [Compressed]XofKeySet	2026-03-11 17:26:14 +01:00
Thomas Montaigu	fa396a7e80	chore: bump tfhe-csprng to 0.8.1	2026-03-11 17:26:14 +01:00
Thomas Montaigu	f92a5267d2	fix(csprng): fix offset being lost during fork The offset was not passed to children, and the parent 'lost' it. Meaning that after a fork using a generator initialized with XofSeed, after a fork, the offset was lost for both the children and parent. Refactor the initialization chain and methods of the AesCtrGenerator to simplify things. (Although the struct is pub, it's in a private module thus it's not public API) The fix is to pass the offset. * A test to check that the parent correctly continues after the fork has been added * Tests now pass a random offset to make sure this case is properly handled	2026-03-11 17:26:14 +01:00
Nicolas Sarlin	d76fef0d82	chore: bump tfhe to 1.5.4	2026-03-03 15:03:40 +01:00
Nicolas Sarlin	1f403aade1	fix(integer): reject packed legacy compact list	2026-03-03 10:17:40 +01:00
David Testé	081bb0f332	chore(ci): update slab-github-runner action to v1.5.1 This version adds randomization on sleep duration between calls to GitHub API when looking for runner registration. This reduces the risk of API rate-limiting.	2026-02-12 09:27:13 +01:00
David Testé	c8b19c3f10	chore(ci): update slab-github-runner action to v1.5.0 This new version improves handling of asynchronous tasks related to Slab and GitHub API.	2026-02-11 13:59:49 +01:00
Agnes Leroy	7408ca1126	chore(gpu): add fallback for 4-l40 in CI	2026-02-11 13:59:49 +01:00
Arthur Meyre	8712746296	chore: bump TFHE-rs to 1.5.3	2026-02-11 13:59:49 +01:00
Arthur Meyre	271644f482	chore: bump tfhe-cuda-backend to 0.13.2	2026-02-11 13:59:49 +01:00
Arthur Meyre	5a7650a2de	chore: revert backward compatibility change - this change should not have been needed and poses risks for backward compatibility - HL CUDA: use dedicated type for the ReRand	2026-02-11 13:59:49 +01:00
Guillermo Oyarzun	a3a5b6fe40	fix(gpu): avoid unspecified behavior during the reduction	2026-02-11 13:55:47 +01:00
Pedro Alves	2210d637c4	fix(gpu): remove unused number_bits_to_unpack initialization in host_extract	2026-02-09 17:03:09 +01:00
Pedro Alves	1e5a38acb0	fix(gpu): assert all items have same shape in from_vec_cuda_lwe_ciphertexts_list	2026-02-09 17:03:09 +01:00
Pedro Alves	f6b08c45e9	fix(gpu): assert exactly one element in CudaLweCiphertextList::into_lwe_ciphertext	2026-02-09 17:03:09 +01:00
Pedro Alves	6145f6b680	fix(gpu): use lwe_compact_ciphertext_list_size for validation in CudaLweCompactCiphertextList::from_d_vec	2026-02-09 17:03:09 +01:00
Agnes Leroy	0d6602df84	chore(gpu): improve checks in device code	2026-02-09 17:03:09 +01:00
Agnes Leroy	a006d5d71f	fix(gpu): fix logic in lwe ciphertext list	2026-02-09 17:03:09 +01:00
Pedro Alves	a7c5f2dbf8	chore(gpu): replaces (a + b - 1) / b patterns in the cuda backend by CEIL_DIV - also, this commit renames kernel_dimensions.cuh to helper.cuh and copies the content of the older helper.cuh into helper_debug.cuh	2026-02-09 17:03:09 +01:00
Agnes Leroy	0816c331c5	chore(gpu): add some missing checks in core crypto	2026-02-09 17:03:09 +01:00
Agnes Leroy	89df44917b	chore(gpu): cleanup device.cu binding, remove _async fuinctions from core crypto	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	b218c98194	fix(gpu): clean unused variables in specialized classical pbs	2026-02-09 17:03:09 +01:00
Andrei Stoian	2355cf4d89	fix(gpu): valgrind error on leaks	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	3b793273cf	feat(gpu): create different threshold for multi-gpu pbs128	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	d3b52d92cb	fix(gpu): fix race condition in tbc implementations	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	af7d69e16c	feat(gpu): avoid register spilling memory in ff128	2026-02-09 17:03:09 +01:00
Guillermo Oyarzun	6bb211f2fd	feat(gpu): use 512 threads for pbs128 flavors	2026-02-09 17:03:09 +01:00
Agnes Leroy	0fcce501a2	fix(gpu): fix small cpu memory leak	2026-02-09 17:03:09 +01:00
Agnes Leroy	204555e11c	fix(gpu): fix noise level in match value	2026-02-09 17:03:09 +01:00
Arthur Meyre	b00e8bafe5	chore: bump TFHE-rs to version 1.5.2	2026-02-09 13:05:01 +01:00
Arthur Meyre	140a780bb6	chore: bump tfhe-cuda-backend to 0.13.1	2026-02-09 13:05:01 +01:00
Thomas Montaigu	e646130324	chore: move shortint expanded types into shortint mod The expanded types definitions were in the high level API as it was originally related to the XofKeySet feature. However, since it's now used even in non-xof setting we decided to move these types to shortint module where they conceptually belong	2026-02-09 09:14:25 +01:00
Thomas Montaigu	f79204e23e	feat(hlapi): add is_conformant for CompressedXofKeySet	2026-02-09 09:14:25 +01:00
Thomas Montaigu	b6b24f4fb9	fix(xofkeyset): generate multibit decompression key when params are multibit	2026-02-09 09:14:25 +01:00
Thomas Montaigu	41621dc4e3	fix(conformance): GGSW list had wrong group count	2026-02-09 09:14:25 +01:00
Thomas Montaigu	662038da19	chore(xof_key_set): make generate_with_pre_seeded_generator public MPC teams needs to be able to generate a CompressedXofKeySet from an existing ClientKey	2026-02-09 09:14:25 +01:00
Thomas Montaigu	9d21aecae9	feat(hlapi): add decompress_to_gpu for CompressedXofKeySet Done using the newly added expand + convert pattern	2026-02-09 09:14:25 +01:00
Thomas Montaigu	58dbdf7dd4	refactor(hlapi): add IntegerExpandedServerKey::convert_to_gpu And use it to convert from CompressedServerKey to CudaServerKey.	2026-02-09 09:14:25 +01:00
Thomas Montaigu	1a7b7ace47	refactor(hlapi): split gpu key conversion in expand/convert converting from CompressedServerKey (Cpu) to CudaServerKey was done via decompress_from_cpu/decompress_to_cuda methods. We refactor to split these functions in 2: one that converts from cpu (input in std domain for bootstrap keys), the other that decompress/expand then calls convert	2026-02-09 09:14:25 +01:00
Thomas Montaigu	7797b60ef2	refactor(hlapi): add expand() method to CompressedServerKey Add an expand method to CompressedServerKey that returns an IntegerExpandedServerKey. Refactor decompress() to use the new expand() then convert_to_cpu(). This will allow later to refactor the convertion from CompressedServerKey to CudaServerKey to follow a similar pattern, meaning we will be able to share the code that converts keys from CPU to GPU between the normal server key and the xof server key	2026-02-09 09:14:25 +01:00
Thomas Montaigu	56c0a9fa5e	fix(hlapi-gpu): correct state of post decompression noise squashed ct The GPU part did not set the correct state This problem is sort of rare to encounter as generally a CompressedSquashedNoiseCiphertextList is deserialized then used. When deserialized its on the CPU, so calls to `get` use CPU code which correctly set the state. This problem is thus visible when either: - safe_deserializing and manually moving the list to GPU - deserialize - directly expanding after the creation of the list	2026-02-09 09:14:25 +01:00
Thomas Montaigu	37bcb7763d	fix(hlapi): return error when trying to expand non packed list	2026-02-09 09:14:25 +01:00
Thomas Montaigu	ae978bedc6	refactor(xof_key_set): split decompression into expansion and conversion Introduce IntegerExpandedServerKey as an intermediate representation between compressed (seeded) keys and backend-specific formats. Decompression is now a two-step process: 1. Seed expansion: decompress seeded keys into standard domain representations (e.g., LweBootstrapKey instead of FourierLweBootstrapKey) 2. Backend conversion: convert to target backend format (CPU Fourier, GPU, etc.) This separation allows sharing the expansion step across backends while specializing only the final conversion, as for this XOF based expansion the order is important Changes: - Split xof_key_set.rs into module structure (mod.rs, internal.rs, test.rs) - Add intermediate types, that contains the expanded, but not converted data	2026-02-09 09:14:25 +01:00
David Testé	43ebb14b9b	doc: fix specs description of aws hpc7a.96xlarge instance These instances have two sockets, each equipped with a 96-core CPU.	2026-02-06 12:25:07 +01:00
David Testé	641b394423	doc: add svg tables to zero-knowledge benchmarks This removes the embedded GSheet. These SVGs display more operations and inputs that reflect real-use cases. Throughput is available only for server-side computation as it's meaningless to perform multiple proof in parallele on the client-side.	2026-02-02 18:05:08 +01:00
Beka Barbakadze	72c18cc2b2	fix(gpu): add __syncthread and threadIdx condition for sample_extract_body in all pbs versions	2026-01-30 20:21:15 +01:00
Pedro Alves	71e95c435d	chore(gpu): add test for single-item compact ciphertext list expand	2026-01-30 20:21:15 +01:00
Andrei Stoian	057217b7d2	fix(gpu): more crypto param checks in cuda backend	2026-01-30 20:21:15 +01:00
Beka Barbakadze	f006039916	refactor(gpu): refactor f128 and fft128 to prevent possible precision losses and improve performance	2026-01-30 20:21:15 +01:00
Pedro Alves	69df8c0310	fix(gpu): fix some inconsistencies in decompression that could enable access of not allocated memory - also adds a diagram explaining how compression / decompression work	2026-01-30 20:21:15 +01:00
Agnes Leroy	482e49eac5	fix(gpu): add check on nullptr for dest_indexes in many lut gather	2026-01-29 15:58:18 +01:00
Guillermo Oyarzun	0dcf2ace3b	fix(gpu): handling temporary events destruction	2026-01-29 15:58:18 +01:00
Pedro Alves	0483c95941	fix(gpu): fix an invalid access in expand when the number of LWEs is odd - also improves test_expander_length_matches_data_items - adds diagrams and explanations about GPU's expand	2026-01-29 15:58:18 +01:00
Guillermo Oyarzun	f9b292ca77	fix(gpu): add panic for 32-bit Torus calls	2026-01-29 15:58:18 +01:00
Agnes Leroy	8e194c4b65	chore(gpu): fix logic to check ptr validity in device.cu	2026-01-29 15:58:18 +01:00
Agnes Leroy	f02bba151c	chore(gpu): stop trying to enable NVlink since we don't use it	2026-01-29 15:58:18 +01:00
Agnes Leroy	a477548161	chore(gpu): add comments in device.cu	2026-01-29 15:58:18 +01:00
David Testé	45a7d5217c	chore(docs): update benchmark results for all backends	2026-01-26 17:08:40 +01:00
Andrei Stoian	865c0887ee	fix(gpu): mutex lock	2026-01-26 11:20:54 +01:00
Agnes Leroy	0b9df1c8cd	doc: add erc20 benchmark results for all backends	2026-01-26 11:20:54 +01:00
Pedro Alves	cf20e337ef	fix(gpu): fix an inconsistency between CudaCompactCiphertextListExpander::len() and the CPU equivalent	2026-01-23 15:37:18 +01:00
Agnes Leroy	d7aea61eba	fix(gpu): fix potential overflow in create_on_same_gpus	2026-01-23 15:37:18 +01:00
Arthur Meyre	9352ecce54	chore: bump version to 1.5.1	2026-01-20 14:17:55 +01:00
Arthur Meyre	179b52ab26	feat: add missing raw parts APIs for shortint (Compressed)DecompressionKey	2026-01-20 14:17:55 +01:00