chore(docs): fix the bench table

refactor(tfhe)!: update key level order for better performance
- use natural order for decomposition levels in bsk
- contains a copy of tfhe-versionable 0.3.2
- updates zk-pok to 0.3.2
- updates the usage to avoid warnings in tfhe-zk-pok

Co-authored-by: Agnes Leroy <agnes.leroy@zama.ai>
2026-04-28 03:01:21 -04:00 · 2024-12-09 08:35:17 +01:00 · 2024-11-06 13:12:12 +01:00 · 2024-11-06 13:12:12 +01:00 · 2024-11-06 13:12:12 +01:00 · 2024-11-06 13:12:12 +01:00
565 changed files with 40957 additions and 20858 deletions
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,6 +47,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -103,7 +104,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -26,6 +26,7 @@ jobs:
    outputs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
+      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.core_crypto_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -56,7 +57,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -64,10 +65,15 @@ jobs:
              - tfhe/Cargo.toml
              - concrete-csprng/**
              - tfhe-zk-pok/**
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
            csprng:
              - concrete-csprng/**
            zk_pok:
              - tfhe-zk-pok/**
+            versionable:
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
            core_crypto:
              - tfhe/src/core_crypto/**
            boolean:
@@ -103,6 +109,7 @@ jobs:
        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+          steps.changed-files.outputs.versionable_any_changed == 'true' ||
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -124,7 +131,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -147,6 +154,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -167,6 +175,11 @@ jobs:
        run: |
          make test_zk_pok

+      - name: Run tfhe-versionable tests
+        if: needs.should-run.outputs.versionable_test == 'true'
+        run: |
+          make test_versionable
+
      - name: Run core tests
        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
@@ -213,7 +226,7 @@ jobs:
          make test_safe_deserialization

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -228,7 +241,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -19,28 +19,60 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]
  push:
    branches:
      - main
-  schedule:
-    # Nightly tests @ 3AM after each work day
-    - cron: "0 3 * * MON-FRI"

 jobs:
+  should-run:
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
+        steps.changed-files.outputs.integer_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+          persist-credentials: "false"
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            integer:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+
  setup-instance:
    name: Setup instance (unsigned-integer-tests)
-    if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-        (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-        (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
-        github.event_name == 'workflow_dispatch'
+    needs: should-run
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +92,8 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -93,7 +126,7 @@ jobs:
          AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -103,12 +136,12 @@ jobs:
  teardown-instance:
    name: Teardown instance (unsigned-integer-tests)
    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, unsigned-integer-tests ]
+    needs: [setup-instance, unsigned-integer-tests]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -19,28 +19,60 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]
  push:
    branches:
      - main
-  schedule:
-    # Nightly tests @ 3AM after each work day
-    - cron: "0 3 * * MON-FRI"

 jobs:
+  should-run:
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
+        steps.changed-files.outputs.integer_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+          persist-credentials: "false"
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            integer:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+
  setup-instance:
-    name: Setup instance (signed-integer-tests)
-    if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-        (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-        (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
-        github.event_name == 'workflow_dispatch'
+    name: Setup instance (signed-integer-tests)
+    needs: should-run
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +92,8 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
-          persist-credentials: 'false'
+          persist-credentials: "false"
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -97,7 +130,7 @@ jobs:
          AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -107,12 +140,12 @@ jobs:
  teardown-instance:
    name: Teardown instance (signed-integer-tests)
    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
-    needs: [ setup-instance, signed-integer-tests ]
+    needs: [setup-instance, signed-integer-tests]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -63,7 +63,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -131,7 +131,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -154,6 +154,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -222,7 +223,7 @@ jobs:
          make test_kreyvium

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -237,7 +238,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -48,6 +48,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -58,9 +59,11 @@ jobs:
        with:
          toolchain: stable

-      - name: Install Node
+      - name: Install web resources
        run: |
          make install_node
+          make install_chrome_browser
+          make install_chrome_web_driver

      - name: Run fmt checks
        run: |
@@ -72,10 +75,10 @@ jobs:

      - name: Run parallel wasm tests
        run: |
-          make test_web_js_api_parallel_ci
+          make test_web_js_api_parallel_chrome_ci

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -90,7 +93,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,6 +51,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -98,7 +99,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}
@@ -113,16 +114,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -140,7 +133,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,6 +47,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -86,7 +87,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -101,16 +102,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -128,7 +121,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -42,6 +42,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -82,7 +83,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_integer_multi_bit_gpu_default
          path: ${{ env.RESULTS_FILENAME }}
@@ -90,19 +91,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -164,7 +157,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -184,7 +177,7 @@ jobs:
          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
-        if: ${{ !success() && !cancelled() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -68,6 +68,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -128,7 +129,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -143,16 +144,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
@@ -175,7 +168,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer.yml
+++ b/.github/workflows/benchmark_gpu_integer.yml
@@ -30,7 +30,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -71,6 +71,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -124,7 +125,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -144,7 +145,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}
@@ -159,16 +160,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
@@ -191,7 +184,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer_2H100_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_2H100_full.yml
@@ -0,0 +1,194 @@
+# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: Integer 2xH100 benchmarks
+
+on:
+  workflow_dispatch:  # manual runs
+  schedule:
+    # Weekly benchmarks are triggered every Saturday at 01:00 (GitHub evaluates cron schedules in UTC).
+    - cron: '0 1 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json  # written by "Parse results", then uploaded and sent to Slab
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}  # link appended to the Slack messages
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"  # 8 * 1024 * 1024
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-integer-full-2-gpu-benchmarks)
+    # Scheduled runs are restricted to zama-ai/tfhe-rs; every other trigger is allowed anywhere.
+    if: github.event_name != 'schedule' || github.repository == 'zama-ai/tfhe-rs'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # read by the benchmark and teardown jobs
+    steps:
+      - id: start-instance
+        name: Start instance
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        with:
+          backend: hyperstack
+          profile: 2-h100
+          mode: start
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+
+  cuda-integer-full-2-gpu-benchmarks:  # builds CMake, runs the GPU benches, then parses, uploads and sends the results
+    name: Execute 2xH100 integer benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}  # label exported by the setup-instance job
+    timeout-minutes: 1440 # 24 hours
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        command: [integer_multi_bit]  # selects the `bench_<command>_gpu` make target
+        op_flavor: [default]  # forwarded to make as BENCH_OP_FLAVOR
+        # explicit include-based build matrix, of known valid options
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6  # downloaded and built from source in "Install dependencies"
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0  # full history, needed by `git describe --tags` below
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set by default.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/slab
+          path: slab  # provides slab/scripts/data_sender.py used by "Send data to Slab"
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+
+      # The branch name is read from the runner-provided GITHUB_REF_NAME variable and quoted,
+      # rather than spliced into the script text, so unusual ref names cannot alter the command.
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "n3-H100x2" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch "${GITHUB_REF_NAME}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
+        with:
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"
+
+  slack-notify:  # posts one Slack message when the run has a failure to report
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-integer-full-2-gpu-benchmarks.result != 'skipped' && failure() }}  # needs failure() to hold and the benchmark job not to be skipped
+    continue-on-error: true  # NOTE(review): the benchmark job also sets continue-on-error; confirm failure() still fires when it fails
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}  # colour is taken from the benchmark job result
+          SLACK_MESSAGE: "Integer GPU 2xH100 benchmarks finished with status: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:  # stops the instance started by setup-instance, whatever the benchmark outcome
+    name: Teardown instance (cuda-integer-full-2-gpu-benchmarks)
+    needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
+    if: always() && needs.setup-instance.result != 'skipped'
+    runs-on: ubuntu-latest
+    steps:
+      - id: stop-instance
+        name: Stop instance
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        with:
+          mode: stop
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # runner label exported by setup-instance
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+
+      - name: Slack Notification
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        if: failure()
+        continue-on-error: true
+        env:
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-2-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ job.status }}
--- a/.github/workflows/benchmark_gpu_integer_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_full.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
      max-parallel: 1
      matrix:
        command: [integer, integer_multi_bit]
-        op_flavor: [default, unchecked]
+        op_flavor: [default]
        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
@@ -75,6 +75,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -129,6 +130,12 @@ jobs:
        run: |
          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu

+      # Run these benchmarks only once
+      - name: Run compression benchmarks with AVX512
+        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
+        run: |
+          make bench_integer_compression_gpu
+
      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
@@ -144,7 +151,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -152,22 +159,14 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-full-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-integer-full-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -184,7 +183,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer_multi_bit.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit.yml
@@ -42,7 +42,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,6 +84,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -147,7 +148,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -167,7 +168,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}
@@ -182,23 +183,14 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-integer-multi-bit-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -215,7 +207,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
@@ -42,7 +42,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -85,6 +85,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -164,7 +165,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}
@@ -172,22 +173,14 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -204,7 +197,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -39,7 +39,7 @@ jobs:
          profile: multi-h100

  cuda-integer-full-multi-gpu-benchmarks:
-    name: Execute multi GPU integer benchmarks for all operations flavor
+    name: Execute multi GPU integer benchmarks
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 1440 # 24 hours
@@ -48,8 +48,8 @@ jobs:
      fail-fast: false
      max-parallel: 1
      matrix:
-        command: [integer, integer_multi_bit]
-        op_flavor: [default, unchecked]
+        command: [integer_multi_bit]
+        op_flavor: [default]
        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
@@ -75,6 +75,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -144,7 +145,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -152,22 +153,14 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-integer-full-multi-gpu-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -184,7 +177,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_l40.yml
+++ b/.github/workflows/benchmark_gpu_l40.yml
@@ -0,0 +1,206 @@
+# Run benchmarks on an L40 VM and return parsed results to Slab CI bot.
+name: Cuda benchmarks (L40)
+
+on:
+  workflow_dispatch:  # manual runs
+  schedule:
+    # Weekly benchmarks are triggered every Saturday at 01:00 (GitHub evaluates cron schedules in UTC).
+    - cron: '0 1 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json  # written by "Parse results", then uploaded and sent to Slab
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}  # link appended to the Slack messages
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"  # 8 * 1024 * 1024
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+jobs:
+  setup-instance:  # starts the benchmark runner and exports its label
+    name: Setup instance (cuda-l40-benchmarks)
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # read by the benchmark and teardown jobs
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: l40
+
+  cuda-l40-benchmarks:  # builds CMake, runs the GPU benches, then parses, uploads and sends the results
+    name: Cuda benchmarks (L40)
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}  # label exported by the setup-instance job
+    timeout-minutes: 1440 # 24 hours
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        command: [integer_multi_bit]  # selects the `bench_<command>_gpu` make target
+        op_flavor: [default]  # forwarded to make as BENCH_OP_FLAVOR
+        # explicit include-based build matrix, of known valid options
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6  # downloaded and built from source in "Install dependencies"
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0  # full history, needed by `git describe --tags` below
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set by default.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/slab
+          path: slab  # provides slab/scripts/data_sender.py used by "Send data to Slab"
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+
+      - name: Run compression benchmarks with AVX512
+        run: |
+          make bench_integer_compression_gpu
+
+      - name: Run PBS benchmarks
+        run: |
+          make bench_pbs_gpu
+
+      - name: Run KS benchmarks
+        run: |
+          make bench_ks_gpu
+
+      # The branch name is read from the runner-provided GITHUB_REF_NAME variable and quoted,
+      # rather than spliced into the script text, so unusual ref names cannot alter the command.
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "n3-L40x1" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch "${GITHUB_REF_NAME}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
+        with:
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"
+
+  slack-notify:  # posts one Slack message when the run has a failure to report
+    name: Slack Notification
+    needs: [ setup-instance, cuda-l40-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.cuda-l40-benchmarks.result != 'skipped' && failure() }}  # needs failure() to hold and the benchmark job not to be skipped
+    continue-on-error: true  # NOTE(review): the benchmark job also sets continue-on-error; confirm failure() still fires when it fails
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-l40-benchmarks.result }}  # colour is taken from the benchmark job result
+          SLACK_MESSAGE: "Cuda benchmarks (L40) finished with status: ${{ needs.cuda-l40-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:  # stops the instance started by setup-instance, whatever the benchmark outcome
+    name: Teardown instance (cuda-l40-benchmarks)
+    needs: [ setup-instance, cuda-l40-benchmarks, slack-notify ]
+    if: always() && needs.setup-instance.result != 'skipped'
+    runs-on: ubuntu-latest
+    steps:
+      - id: stop-instance
+        name: Stop instance
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        with:
+          mode: stop
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # runner label exported by setup-instance
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+
+      - name: Slack Notification
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        if: failure()
+        continue-on-error: true
+        env:
+          SLACK_MESSAGE: "Instance teardown (cuda-l40-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ job.status }}
--- a/.github/workflows/integer_cpu_benchmark.yml
+++ b/.github/workflows/integer_cpu_benchmark.yml
@@ -62,7 +62,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,6 +90,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -125,6 +126,12 @@ jobs:
        run: |
          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}

+      # Run these benchmarks only once
+      - name: Run compression benchmarks with AVX512
+        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
+        run: |
+          make bench_integer_compression
+
      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
@@ -139,7 +146,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -147,16 +154,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -174,7 +173,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/shortint_cpu_benchmark.yml
+++ b/.github/workflows/shortint_cpu_benchmark.yml
@@ -56,7 +56,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -82,6 +82,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -141,7 +142,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -149,16 +150,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -176,7 +169,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/signed_integer_cpu_benchmark.yml
+++ b/.github/workflows/signed_integer_cpu_benchmark.yml
@@ -62,7 +62,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,6 +90,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -139,7 +140,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -147,16 +148,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -174,7 +167,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -39,7 +39,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -64,7 +64,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -83,6 +83,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -102,10 +103,15 @@ jobs:
        with:
          toolchain: nightly

-      - name: Run benchmarks
+      - name: Install web resources
        run: |
          make install_node
-          make bench_web_js_api_parallel_ci
+          make install_chrome_browser
+          make install_chrome_web_driver
+
+      - name: Run benchmarks
+        run: |
+          make bench_web_js_api_parallel_chrome_ci

      - name: Parse results
        run: |
@@ -130,7 +136,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_wasm
          path: ${{ env.RESULTS_FILENAME }}
@@ -145,16 +151,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -172,7 +170,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -36,7 +36,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,7 +79,7 @@ jobs:
    if: needs.setup-instance.result != 'skipped'
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -87,6 +87,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -138,7 +139,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
        with:
          name: ${{ github.sha }}_integer_zk
          path: ${{ env.RESULTS_FILENAME }}
@@ -153,19 +154,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ !success() && !cancelled() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -180,7 +173,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -19,14 +19,21 @@ jobs:

    strategy:
      matrix:
-        os: [large_ubuntu_16, macos-latest-large, large_windows_16_latest]
+        # GitHub macos-latest runners are now M1 Macs, so use ours; we limit what runs so it
+        # stays fast even with a few PRs
+        os: [large_ubuntu_16, macos-latest, windows-latest]
      fail-fast: false

    steps:
      - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: stable
+
      - name: Install and run newline linter checks
-        if: matrix.os == 'ubuntu-latest'
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
          echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
@@ -36,27 +43,33 @@ jobs:
          make check_newline

      - name: Run pcc checks
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make pcc

      - name: Build concrete-csprng
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_concrete_csprng

      - name: Build Release core
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_core AVX512_SUPPORT=ON
          make build_core_experimental AVX512_SUPPORT=ON

      - name: Build Release boolean
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_boolean

      - name: Build Release shortint
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_shortint

      - name: Build Release integer
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_integer

@@ -65,10 +78,12 @@ jobs:
          make build_tfhe_full

      - name: Build Release c_api
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_c_api

      - name: Build coverage tests
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_tfhe_coverage

--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -25,3 +25,9 @@ jobs:
      - name: Lint workflows
        run: |
          make lint_workflow
+
+      - name: Ensure SHA pinned actions
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@0901cf7b71c7ea6261ec69a3dc2bd3f9264f893e # v3.0.12
+        with:
+          allowlist: |
+            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -57,7 +57,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          files_yaml: |
            tfhe:
@@ -125,7 +125,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -48,6 +48,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -78,7 +79,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -37,6 +37,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -74,7 +75,7 @@ jobs:
          github_token: ${{ secrets.GITHUB_TOKEN }}

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -34,7 +34,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -44,7 +44,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/shortint/parameters/**
+              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,6 +111,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -146,7 +147,8 @@ jobs:

      - name: Run core crypto and internal CUDA backend tests
        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=FALSE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=FALSE make test_integer_compression_gpu
          BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

      - name: Run user docs tests
@@ -165,7 +167,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -182,7 +184,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -33,7 +33,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -43,7 +43,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/shortint/parameters/**
+              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -63,7 +63,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -109,6 +109,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -145,6 +146,7 @@ jobs:
      - name: Run core crypto and internal CUDA backend tests
        run: |
          make test_core_crypto_gpu
+          make test_integer_compression_gpu
          make test_cuda_backend

      - name: Run user docs tests
@@ -163,7 +165,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -180,7 +182,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -0,0 +1,159 @@
+# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
+name: TFHE Cuda Backend - Full tests on H100
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+
+on:
+  workflow_dispatch:
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-h100-tests)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-tests-linux:
+    name: CUDA H100 tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run core crypto, integer and internal CUDA backend tests
+        run: |
+          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ failure() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-h100-tests)
+    # Run even when the test job fails or is cancelled; without always() this job is
+    # skipped as soon as a needed job fails and the stop request below is never sent.
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -34,7 +34,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -44,7 +44,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/shortint/parameters/**
+              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,6 +111,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -144,6 +145,10 @@ jobs:
        if: ${{ !cancelled() }}
        run: nvidia-smi

+      - name: Run multi-bit CUDA integer compression tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
+
      # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
      - name: Run multi-bit CUDA integer tests
        run: |
@@ -165,7 +170,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -182,7 +187,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -24,7 +24,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -56,6 +56,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -94,7 +95,7 @@ jobs:
          make pcc_gpu

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -109,7 +110,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -31,10 +31,11 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -44,7 +45,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/shortint/parameters/**
+              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -65,7 +66,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -154,7 +155,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -171,7 +172,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -42,7 +42,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -52,7 +52,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/shortint/parameters/**
+              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -73,7 +73,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,6 +119,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -168,7 +169,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-signed-integer-tests ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -185,7 +186,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -31,10 +31,11 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -44,7 +45,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/shortint/parameters/**
+              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -65,7 +66,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -154,7 +155,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -171,7 +172,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -38,10 +38,11 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -51,7 +52,7 @@ jobs:
              - backends/tfhe-cuda-backend/**
              - tfhe/src/core_crypto/gpu/**
              - tfhe/src/integer/gpu/**
-              - tfhe/shortint/parameters/**
+              - tfhe/src/shortint/parameters/**
              - tfhe/src/high_level_api/**
              - tfhe/src/c_api/**
              - 'tfhe/docs/**.md'
@@ -72,7 +73,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -165,7 +166,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-unsigned-integer-tests ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -182,7 +183,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -42,7 +42,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe
-      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
+      - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
        with:
          name: crate
          path: target/package/*.crate
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -29,14 +29,14 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: aws
-          profile: gpu-test
+          profile: gpu-build

  publish-cuda-release:
    name: Publish CUDA Release
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,8 @@ backends/tfhe-cuda-backend/cuda/cmake-build-debug/

 # WASM tests
 tfhe/web_wasm_parallel_tests/server.PID
+venv/
+web-test-runner/

 # Dir used for backward compatibility test data
 tfhe/tfhe-backward-compat-data/
--- a/Makefile
+++ b/Makefile
@@ -18,13 +18,15 @@ FAST_TESTS?=FALSE
 FAST_BENCH?=FALSE
 NIGHTLY_TESTS?=FALSE
 BENCH_OP_FLAVOR?=DEFAULT
-NODE_VERSION=22.4
+NODE_VERSION=22.6
 FORWARD_COMPAT?=OFF
 BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
-BACKWARD_COMPAT_DATA_BRANCH?=v0.1
+BACKWARD_COMPAT_DATA_BRANCH?=v0.4
 BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
 BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
 TFHE_SPEC:=tfhe
+WEB_RUNNER_DIR=web-test-runner
+WEB_SERVER_DIR=tfhe/web_wasm_parallel_tests
 # This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
 # copy paste the command in the terminal and change them if required without forgetting the flags
 export RUSTFLAGS?=-C target-cpu=native
@@ -146,6 +148,37 @@ install_tfhe_lints:
 	(cd utils/cargo-tfhe-lints-inner && cargo install --path .) && \
 	cd utils/cargo-tfhe-lints && cargo install --path .

+.PHONY: setup_venv # Setup Python virtualenv for wasm tests
+setup_venv:
+	python3 -m venv venv
+	@source venv/bin/activate && \
+	pip3 install -r ci/webdriver_requirements.txt
+
+# This is an internal target, not meant to be called on its own.
+install_web_resource:
+	wget -P $(dest) $(url)
+	@cd $(dest) && \
+	echo "$(checksum) $(filename)" > checksum && \
+	sha256sum -c checksum && \
+	rm checksum && \
+	unzip $(filename)
+
+install_chrome_browser: url = "https://storage.googleapis.com/chrome-for-testing-public/130.0.6723.69/linux64/chrome-linux64.zip"
+install_chrome_browser: checksum = "f789d53911a50cfa4a2bc1f09cde57567247f52515436d92b1aa9de93c2787d0"
+install_chrome_browser: dest = "$(WEB_RUNNER_DIR)/chrome"
+install_chrome_browser: filename = "chrome-linux64.zip"
+
+.PHONY: install_chrome_browser # Install Chrome browser for Linux
+install_chrome_browser: install_web_resource
+
+install_chrome_web_driver: url = "https://storage.googleapis.com/chrome-for-testing-public/130.0.6723.69/linux64/chromedriver-linux64.zip"
+install_chrome_web_driver: checksum = "90fe8dedf33eefe4b72704f626fa9f5834427c042235cfeb4251f18c9f0336ea"
+install_chrome_web_driver: dest = "$(WEB_RUNNER_DIR)/chrome"
+install_chrome_web_driver: filename = "chromedriver-linux64.zip"
+
+.PHONY: install_chrome_web_driver # Install Chrome web driver for Linux
+install_chrome_web_driver: install_web_resource
+
 .PHONY: check_linelint_installed # Check if linelint newline linter is installed
 check_linelint_installed:
 	@printf "\n" | linelint - > /dev/null 2>&1 || \
@@ -214,6 +247,13 @@ clippy_gpu: install_rs_check_toolchain
 		--all-targets \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

+.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
+check_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
+		--all-targets \
+		-p $(TFHE_SPEC)
+
 .PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
 fix_newline: check_linelint_installed
 	linelint -a .
@@ -252,12 +292,18 @@ clippy_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),shortint \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=$(TARGET_ARCH_FEATURE),shortint,experimental \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_integer # Run clippy lints enabling the integer features
 clippy_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),integer \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=$(TARGET_ARCH_FEATURE),integer,experimental \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy # Run clippy lints enabling the boolean, shortint, integer
 clippy: install_rs_check_toolchain
@@ -284,6 +330,9 @@ clippy_c_api: install_rs_check_toolchain

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
 clippy_js_wasm_api: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
@@ -303,6 +352,9 @@ clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,experimental \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
 clippy_concrete_csprng: install_rs_check_toolchain
@@ -315,9 +367,17 @@ clippy_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-zk-pok -- --no-deps -D warnings

+.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
+clippy_versionable: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-versionable-derive -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-versionable -- --no-deps -D warnings
+
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
-clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
+clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium \
+clippy_versionable

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -331,7 +391,7 @@ clippy_cuda_backend: install_rs_check_toolchain
 .PHONY: tfhe_lints # Run custom tfhe-rs lints
 tfhe_lints: install_tfhe_lints
 	cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -- -D warnings
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok -- -D warnings

 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
@@ -419,6 +479,7 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
 		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
 		-Z build-std=panic_abort,std && \
 	find pkg/snippets -type f -iname workerHelpers.worker.js -exec sed -i "s|from '..\/..\/..\/';|from '..\/..\/..\/tfhe.js';|" {} \;
+	jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json

 .PHONY: build_node_js_api # Build the js API targeting nodejs
 build_node_js_api: install_rs_build_toolchain install_wasm_pack
@@ -470,17 +531,31 @@ test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu:: --test-threads=1
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu:: --test-threads=1

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
 test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=1
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

+.PHONY: test_integer_compression # Run the compressed ciphertext list tests and doctests of the integer module
+test_integer_compression: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) -- integer::ciphertext::compressed_ciphertext_list::tests::
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) -- integer::ciphertext::compress
+
+.PHONY: test_integer_compression_gpu # Run the compressed ciphertext list tests and doctests of the integer module on the gpu backend
+test_integer_compression_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests:: --test-threads=1
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress --test-threads=1
+
 .PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
 test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -734,7 +809,7 @@ test_zk_pok: install_rs_build_toolchain
 .PHONY: test_versionable # Run tests for tfhe-versionable subcrate
 test_versionable: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		-p tfhe-versionable
+		--all-targets -p tfhe-versionable

 # The backward compat data repo holds historical binary data but also rust code to generate and load them.
 # Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
@@ -742,7 +817,7 @@ test_versionable: install_rs_build_toolchain
 test_backward_compatibility_ci: install_rs_build_toolchain
 	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
-		--features=$(TARGET_ARCH_FEATURE),shortint,integer -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
+		--features=$(TARGET_ARCH_FEATURE),shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
 test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
@@ -833,18 +908,35 @@ test_nodejs_wasm_api_in_docker: build_nodejs_test_docker

 .PHONY: test_nodejs_wasm_api # Run tests for the nodejs on wasm API
 test_nodejs_wasm_api: build_node_js_api
-	cd tfhe/js_on_wasm_tests && npm run test
+	cd tfhe/js_on_wasm_tests && npm install && npm run test

-.PHONY: test_web_js_api_parallel # Run tests for the web wasm api
-test_web_js_api_parallel: build_web_js_api_parallel
-	$(MAKE) -C tfhe/web_wasm_parallel_tests test

-.PHONY: test_web_js_api_parallel_ci # Run tests for the web wasm api
-test_web_js_api_parallel_ci: build_web_js_api_parallel
+# This is an internal target, not meant to be called on its own.
+run_web_js_api_parallel: build_web_js_api_parallel setup_venv
+	cd $(WEB_SERVER_DIR) && npm install && npm run build
+	source venv/bin/activate && \
+	python ci/webdriver.py \
+	--browser-path $(browser_path) \
+	--driver-path $(driver_path) \
+	--browser-kind  $(browser_kind) \
+	--server-cmd "npm run server" \
+	--server-workdir "$(WEB_SERVER_DIR)" \
+	--id-pattern $(filter)
+
+test_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
+test_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
+test_web_js_api_parallel_chrome: browser_kind = chrome
+test_web_js_api_parallel_chrome: filter = Test
+
+.PHONY: test_web_js_api_parallel_chrome # Run tests for the web wasm api
+test_web_js_api_parallel_chrome: run_web_js_api_parallel
+
+.PHONY: test_web_js_api_parallel_chrome_ci # Run tests for the web wasm api
+test_web_js_api_parallel_chrome_ci: setup_venv
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) -C tfhe/web_wasm_parallel_tests test-ci
+	$(MAKE) test_web_js_api_parallel_chrome

 .PHONY: no_tfhe_typo # Check we did not invert the h and f in tfhe
 no_tfhe_typo:
@@ -862,6 +954,11 @@ dieharder_csprng: install_dieharder build_concrete_csprng
 # Benchmarks
 #

+.PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
+print_doc_bench_parameters:
+	RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p tfhe
+
 .PHONY: bench_integer # Run benchmarks for unsigned integer
 bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
@@ -883,6 +980,18 @@ bench_integer_gpu: install_rs_check_toolchain
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

+.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
+bench_integer_compression: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench	glwe_packing_compression-integer-bench \
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+
+.PHONY: bench_integer_compression_gpu # Run benchmarks for integer compression with the gpu feature enabled
+bench_integer_compression_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench	glwe_packing_compression-integer-bench \
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
+
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -981,15 +1090,20 @@ bench_ks_gpu: install_rs_check_toolchain
 	--bench ks-bench \
 	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

-.PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
-bench_web_js_api_parallel: build_web_js_api_parallel
-	$(MAKE) -C tfhe/web_wasm_parallel_tests bench
+bench_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
+bench_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
+bench_web_js_api_parallel_chrome: browser_kind = chrome
+bench_web_js_api_parallel_chrome: filter = Bench

-.PHONY: bench_web_js_api_parallel_ci # Run benchmarks for the web wasm api
-bench_web_js_api_parallel_ci: build_web_js_api_parallel
+.PHONY: bench_web_js_api_parallel_chrome # Run benchmarks for the web wasm api
+bench_web_js_api_parallel_chrome: run_web_js_api_parallel
+
+.PHONY: bench_web_js_api_parallel_chrome_ci # Run benchmarks for the web wasm api
+bench_web_js_api_parallel_chrome_ci: setup_venv
 	source ~/.nvm/nvm.sh && \
+	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
-	$(MAKE) -C tfhe/web_wasm_parallel_tests bench-ci
+	$(MAKE) bench_web_js_api_parallel_chrome

 #
 # Utility tools
@@ -1037,7 +1151,7 @@ parse_wasm_benchmarks: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example wasm_benchmarks_parser \
 	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
-	-- web_wasm_parallel_tests/test/benchmark_results
+	-- wasm_benchmark_results.json

 .PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
 write_params_to_file: install_rs_check_toolchain
--- a/README.md
+++ b/README.md
@@ -159,7 +159,7 @@ To run this code, use the following command:
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performances possible.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
+*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*

 <p align="right">
  <a href="#about" > ↑ Back to top </a> 
--- a/apps/trivium/src/kreyvium/kreyvium.rs
+++ b/apps/trivium/src/kreyvium/kreyvium.rs
@@ -148,10 +148,9 @@ where

    /// Computes one turn of the stream, updating registers and outputting the new bit.
    pub fn next_bool(&mut self) -> T {
-        match &self.fhe_key {
-            Some(sk) => set_server_key(sk.clone()),
-            None => (),
-        };
+        if let Some(sk) = &self.fhe_key {
+            set_server_key(sk.clone());
+        }

        let [o, a, b, c] = self.get_output_and_values(0);

@@ -226,18 +225,12 @@ where
    /// Computes 64 turns of the stream, outputting the 64 bits all at once in a
    /// Vec (first value is oldest, last is newest)
    pub fn next_64(&mut self) -> Vec<T> {
-        match &self.fhe_key {
-            Some(sk) => {
-                rayon::broadcast(|_| set_server_key(sk.clone()));
-            }
-            None => (),
+        if let Some(sk) = &self.fhe_key {
+            rayon::broadcast(|_| set_server_key(sk.clone()));
        }
        let mut values = self.get_64_output_and_values();
-        match &self.fhe_key {
-            Some(_) => {
-                rayon::broadcast(|_| unset_server_key());
-            }
-            None => (),
+        if self.fhe_key.is_some() {
+            rayon::broadcast(|_| unset_server_key());
        }

        let mut ret = Vec::<T>::with_capacity(64);
--- a/apps/trivium/src/kreyvium/kreyvium_byte.rs
+++ b/apps/trivium/src/kreyvium/kreyvium_byte.rs
@@ -237,18 +237,12 @@ where
    /// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
    /// Vec (first value is oldest, last is newest)
    pub fn next_64(&mut self) -> Vec<T> {
-        match &self.fhe_key {
-            Some(sk) => {
-                rayon::broadcast(|_| set_server_key(sk.clone()));
-            }
-            None => (),
+        if let Some(sk) = &self.fhe_key {
+            rayon::broadcast(|_| set_server_key(sk.clone()));
        }
        let values = self.get_64_output_and_values();
-        match &self.fhe_key {
-            Some(_) => {
-                rayon::broadcast(|_| unset_server_key());
-            }
-            None => (),
+        if self.fhe_key.is_some() {
+            rayon::broadcast(|_| unset_server_key());
        }

        let mut bytes = Vec::<T>::with_capacity(8);
--- a/apps/trivium/src/lib.rs
+++ b/apps/trivium/src/lib.rs
@@ -1,3 +1,5 @@
+#![allow(clippy::too_long_first_doc_paragraph)]
+
 mod static_deque;

 mod kreyvium;
--- a/apps/trivium/src/trivium/trivium_bool.rs
+++ b/apps/trivium/src/trivium/trivium_bool.rs
@@ -120,10 +120,9 @@ where

    /// Computes one turn of the stream, updating registers and outputting the new bit.
    pub fn next_bool(&mut self) -> T {
-        match &self.fhe_key {
-            Some(sk) => set_server_key(sk.clone()),
-            None => (),
-        };
+        if let Some(sk) = &self.fhe_key {
+            set_server_key(sk.clone());
+        }

        let [o, a, b, c] = self.get_output_and_values(0);

@@ -196,18 +195,12 @@ where
    /// Computes 64 turns of the stream, outputting the 64 bits all at once in a
    /// Vec (first value is oldest, last is newest)
    pub fn next_64(&mut self) -> Vec<T> {
-        match &self.fhe_key {
-            Some(sk) => {
-                rayon::broadcast(|_| set_server_key(sk.clone()));
-            }
-            None => (),
+        if let Some(sk) = &self.fhe_key {
+            rayon::broadcast(|_| set_server_key(sk.clone()));
        }
        let mut values = self.get_64_output_and_values();
-        match &self.fhe_key {
-            Some(_) => {
-                rayon::broadcast(|_| unset_server_key());
-            }
-            None => (),
+        if self.fhe_key.is_some() {
+            rayon::broadcast(|_| unset_server_key());
        }

        let mut ret = Vec::<T>::with_capacity(64);
--- a/apps/trivium/src/trivium/trivium_byte.rs
+++ b/apps/trivium/src/trivium/trivium_byte.rs
@@ -187,18 +187,12 @@ where
    /// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
    /// Vec (first value is oldest, last is newest)
    pub fn next_64(&mut self) -> Vec<T> {
-        match &self.fhe_key {
-            Some(sk) => {
-                rayon::broadcast(|_| set_server_key(sk.clone()));
-            }
-            None => (),
+        if let Some(sk) = &self.fhe_key {
+            rayon::broadcast(|_| set_server_key(sk.clone()));
        }
        let values = self.get_64_output_and_values();
-        match &self.fhe_key {
-            Some(_) => {
-                rayon::broadcast(|_| unset_server_key());
-            }
-            None => (),
+        if self.fhe_key.is_some() {
+            rayon::broadcast(|_| unset_server_key());
        }

        let mut bytes = Vec::<T>::with_capacity(8);
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.4.0-alpha.0"
+version = "0.6.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -67,9 +67,21 @@ endif()

 add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

+# Pick OPTIMIZATION_FLAGS from CMAKE_BUILD_TYPE; they are spliced into CMAKE_CUDA_FLAGS below
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+  # Debug build: define the DEBUG macro and append -O0 -G -g
+  message("Compiling in Debug mode")
+  add_definitions(-DDEBUG)
+  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
+else()
+  # Any other build type (including an unset one) is treated as Release: append -O3
+  message("Compiling in Release mode")
+  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
+endif()
+
 # in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
 set(CMAKE_CUDA_FLAGS
-    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
+    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
  -std=c++17 --no-exceptions  --expt-relaxed-constexpr -rdc=true \
  --use_fast_math -Xcompiler -fPIC")

--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -18,7 +18,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,

 void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
                                 void *lwe_array_out, void *glwe_array_in,
-                                 uint32_t *nth_array, uint32_t num_glwes,
+                                 uint32_t *nth_array, uint32_t num_nths,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size);
 };
--- a/backends/tfhe-cuda-backend/cuda/include/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/compression.h
@@ -0,0 +1,163 @@
+#ifndef CUDA_INTEGER_COMPRESSION_H
+#define CUDA_INTEGER_COMPRESSION_H
+
+#include "integer.h"
+
+extern "C" {
+void scratch_cuda_integer_compress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
+    bool allocate_gpu_memory);
+
+void scratch_cuda_integer_decompress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
+    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
+    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
+    bool allocate_gpu_memory);
+
+void cuda_integer_compress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
+    int8_t *mem_ptr);
+
+void cuda_integer_decompress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
+    uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
+
+void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
+                                                       uint32_t *gpu_indexes,
+                                                       uint32_t gpu_count,
+                                                       int8_t **mem_ptr_void);
+
+void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
+                                                         uint32_t *gpu_indexes,
+                                                         uint32_t gpu_count,
+                                                         int8_t **mem_ptr_void);
+}
+
+template <typename Torus> struct int_compression {
+  int_radix_params compression_params;
+  uint32_t storage_log_modulus;
+  uint32_t lwe_per_glwe;
+
+  uint32_t body_count;
+
+  // Compression
+  int8_t *fp_ks_buffer;
+  Torus *tmp_lwe;
+  Torus *tmp_glwe_array_out;
+
+  int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
+                  uint32_t gpu_count, int_radix_params compression_params,
+                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
+                  uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+    this->compression_params = compression_params;
+    this->lwe_per_glwe = lwe_per_glwe;
+    this->storage_log_modulus = storage_log_modulus;
+    this->body_count = num_radix_blocks;
+
+    if (allocate_gpu_memory) {
+      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
+                                    compression_params.polynomial_size;
+
+      tmp_lwe = (Torus *)cuda_malloc_async(
+          num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
+              sizeof(Torus),
+          streams[0], gpu_indexes[0]);
+      tmp_glwe_array_out = (Torus *)cuda_malloc_async(
+          lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
+          gpu_indexes[0]);
+
+      scratch_packing_keyswitch_lwe_list_to_glwe_64(
+          streams[0], gpu_indexes[0], &fp_ks_buffer,
+          compression_params.glwe_dimension, compression_params.polynomial_size,
+          num_radix_blocks, true);
+    }
+  }
+  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+               uint32_t gpu_count) {
+    cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
+    cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
+    cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0],
+                                               &fp_ks_buffer);
+  }
+};
+
+template <typename Torus> struct int_decompression {
+  int_radix_params encryption_params;
+  int_radix_params compression_params;
+
+  uint32_t storage_log_modulus;
+
+  uint32_t num_radix_blocks;
+  uint32_t body_count;
+
+  Torus *tmp_extracted_glwe;
+  Torus *tmp_extracted_lwe;
+  uint32_t *tmp_indexes_array;
+
+  int_radix_lut<Torus> *carry_extract_lut;
+
+  int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
+                    uint32_t gpu_count, int_radix_params encryption_params,
+                    int_radix_params compression_params,
+                    uint32_t num_radix_blocks, uint32_t body_count,
+                    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+    this->encryption_params = encryption_params;
+    this->compression_params = compression_params;
+    this->storage_log_modulus = storage_log_modulus;
+    this->num_radix_blocks = num_radix_blocks;
+    this->body_count = body_count;
+
+    if (allocate_gpu_memory) {
+      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
+                                    compression_params.polynomial_size;
+      Torus lwe_accumulator_size = (compression_params.glwe_dimension *
+                                        compression_params.polynomial_size +
+                                    1);
+      carry_extract_lut = new int_radix_lut<Torus>(
+          streams, gpu_indexes, gpu_count, encryption_params, 1,
+          num_radix_blocks, allocate_gpu_memory);
+
+      tmp_extracted_glwe = (Torus *)cuda_malloc_async(
+          num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
+          gpu_indexes[0]);
+      tmp_indexes_array = (uint32_t *)cuda_malloc_async(
+          num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0]);
+      tmp_extracted_lwe = (Torus *)cuda_malloc_async(
+          num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
+          gpu_indexes[0]);
+
+      // Carry extract LUT: maps x to x / message_modulus
+      auto carry_extract_f = [encryption_params](Torus x) -> Torus {
+        return x / encryption_params.message_modulus;
+      };
+
+      generate_device_accumulator<Torus>(
+          streams[0], gpu_indexes[0],
+          carry_extract_lut->get_lut(gpu_indexes[0], 0),
+          encryption_params.glwe_dimension, encryption_params.polynomial_size,
+          encryption_params.message_modulus, encryption_params.carry_modulus,
+          carry_extract_f);
+
+      carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+    }
+  }
+  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
+               uint32_t gpu_count) {
+    cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
+    cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
+    cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]);
+
+    carry_extract_lut->release(streams, gpu_indexes, gpu_count);
+    delete carry_extract_lut;
+  }
+};
+#endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -39,16 +39,15 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

-bool cuda_check_support_cooperative_groups();
-
-bool cuda_check_support_thread_block_clusters();
-
 void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

 void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index);

+void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
+                            uint32_t gpu_index);
+
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

@@ -62,9 +61,13 @@ void cuda_synchronize_device(uint32_t gpu_index);
 void cuda_drop(void *ptr, uint32_t gpu_index);

 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+}

 int cuda_get_max_shared_memory(uint32_t gpu_index);
-}
+
+bool cuda_check_support_cooperative_groups();
+
+bool cuda_check_support_thread_block_clusters();

 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -8,7 +8,7 @@ extern std::mutex m;
 extern bool p2p_enabled;

 extern "C" {
-int cuda_setup_multi_gpu();
+int32_t cuda_setup_multi_gpu();
 }

 // Define a variant type that can be either a vector or a single pointer
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -1,6 +1,7 @@
 #ifndef CUDA_INTEGER_H
 #define CUDA_INTEGER_H

+#include "keyswitch.h"
 #include "pbs/programmable_bootstrap.cuh"
 #include "programmable_bootstrap.h"
 #include "programmable_bootstrap_multibit.h"
@@ -15,7 +16,6 @@ enum SHIFT_OR_ROTATE_TYPE {
  LEFT_ROTATE = 2,
  RIGHT_ROTATE = 3
 };
-enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 };
 enum BITOP_TYPE {
  BITAND = 0,
  BITOR = 1,
@@ -80,6 +80,11 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
                                            uint32_t gpu_count,
                                            int8_t **mem_ptr_void);

+void cuda_apply_many_univariate_lut_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
+    void **bsks, uint32_t num_blocks, uint32_t num_luts, uint32_t lut_stride);
+
 void scratch_cuda_full_propagation_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -112,10 +117,11 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
 void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
                               uint32_t gpu_count, int8_t **mem_ptr_void);

-void cuda_negate_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus);
+void cuda_negate_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);

 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
@@ -385,8 +391,8 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
-    void **bsks, uint32_t num_blocks, uint32_t shift);
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift);

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -475,7 +481,8 @@ struct int_radix_params {
        message_modulus(message_modulus), carry_modulus(carry_modulus){};

  void print() {
-    printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, "
+    printf("pbs_type: %u, glwe_dimension: %u, "
+           "polynomial_size: %u, "
           "big_lwe_dimension: %u, "
           "small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
           "%u, pbs_base_log: "
@@ -812,7 +819,6 @@ template <typename Torus> struct int_radix_lut {
    }
  }
 };
-
 template <typename Torus> struct int_bit_extract_luts_buffer {
  int_radix_params params;
  int_radix_lut<Torus> *lut;
@@ -970,28 +976,52 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
                                                (params.big_lwe_dimension + 1) *
                                                sizeof(Torus),
                                            streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_bits, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);
      tmp_shift_bits = (Torus *)cuda_malloc_async(
          max_num_bits_that_tell_shift * num_radix_blocks *
              (params.big_lwe_dimension + 1) * sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_shift_bits, 0,
+                        max_num_bits_that_tell_shift * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);

      tmp_rotated = (Torus *)cuda_malloc_async(
          bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_rotated, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);

      tmp_input_bits_a = (Torus *)cuda_malloc_async(
          bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_input_bits_a, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);
      tmp_input_bits_b = (Torus *)cuda_malloc_async(
          bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_input_bits_b, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);
      tmp_mux_inputs = (Torus *)cuda_malloc_async(
          bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_mux_inputs, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);

      auto mux_lut_f = [](Torus x) -> Torus {
        // x is expected to be x = 0bcba
@@ -1151,6 +1181,11 @@ template <typename Torus> struct int_sc_prop_memory {
        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
    step_output = (Torus *)cuda_malloc_async(
        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+    cuda_memset_async(generates_or_propagates, 0,
+                      num_radix_blocks * big_lwe_size_bytes, streams[0],
+                      gpu_indexes[0]);
+    cuda_memset_async(step_output, 0, num_radix_blocks * big_lwe_size_bytes,
+                      streams[0], gpu_indexes[0]);

    // declare functions for lut generation
    auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
@@ -1267,6 +1302,11 @@ template <typename Torus> struct int_overflowing_sub_memory {
        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
    step_output = (Torus *)cuda_malloc_async(
        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+    cuda_memset_async(generates_or_propagates, 0,
+                      num_radix_blocks * big_lwe_size_bytes, streams[0],
+                      gpu_indexes[0]);
+    cuda_memset_async(step_output, 0, num_radix_blocks * big_lwe_size_bytes,
+                      streams[0], gpu_indexes[0]);

    // declare functions for lut generation
    auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
@@ -1356,6 +1396,7 @@ template <typename Torus> struct int_overflowing_sub_memory {

 template <typename Torus> struct int_sum_ciphertexts_vec_memory {
  Torus *new_blocks;
+  Torus *new_blocks_copy;
  Torus *old_blocks;
  Torus *small_lwe_vector;
  int_radix_params params;
@@ -1383,17 +1424,40 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
    new_blocks = (Torus *)cuda_malloc_async(
        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
        streams[0], gpu_indexes[0]);
+    new_blocks_copy = (Torus *)cuda_malloc_async(
+        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
+        streams[0], gpu_indexes[0]);
    old_blocks = (Torus *)cuda_malloc_async(
        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
        streams[0], gpu_indexes[0]);
    small_lwe_vector = (Torus *)cuda_malloc_async(
        max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus),
        streams[0], gpu_indexes[0]);
+    cuda_memset_async(new_blocks, 0,
+                      max_pbs_count * (params.big_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(new_blocks_copy, 0,
+                      max_pbs_count * (params.big_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(old_blocks, 0,
+                      max_pbs_count * (params.big_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(small_lwe_vector, 0,
+                      max_pbs_count * (params.small_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);

    d_smart_copy_in = (int32_t *)cuda_malloc_async(
        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
    d_smart_copy_out = (int32_t *)cuda_malloc_async(
        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
+    cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t),
+                      streams[0], gpu_indexes[0]);
  }

  int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -1414,11 +1478,22 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
    this->new_blocks = new_blocks;
    this->old_blocks = old_blocks;
    this->small_lwe_vector = small_lwe_vector;
+    new_blocks_copy = (Torus *)cuda_malloc_async(
+        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
+        streams[0], gpu_indexes[0]);
+    cuda_memset_async(new_blocks_copy, 0,
+                      max_pbs_count * (params.big_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);

    d_smart_copy_in = (int32_t *)cuda_malloc_async(
        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
    d_smart_copy_out = (int32_t *)cuda_malloc_async(
        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
+    cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t),
+                      streams[0], gpu_indexes[0]);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -1432,8 +1507,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
      cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
    }

+    cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]);
    scp_mem->release(streams, gpu_indexes, gpu_count);
-
    delete scp_mem;
  }
 };
@@ -2087,7 +2162,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {

    if (allocate_gpu_memory) {
      Torus total_modulus = params.message_modulus * params.carry_modulus;
-      uint32_t max_value = total_modulus - 1;
+      uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);

      int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
      tmp_block_accumulated = (Torus *)cuda_malloc_async(
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -16,6 +16,21 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+
+void scratch_packing_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
+    bool allocate_gpu_memory);
+
+void cuda_packing_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
+    void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
+
+void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
+                                                uint32_t gpu_index,
+                                                int8_t **fp_ks_buffer);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -69,7 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
+    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

 void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -77,18 +77,10 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
+    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer);
-
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count);
-
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count);
 }

 template <typename Torus>
@@ -339,7 +331,8 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template <typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
@@ -348,7 +341,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 #if (CUDA_ARCH >= 900)
 template <typename Torus>
@@ -358,7 +352,8 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_tbc(
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -17,8 +17,7 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(

 void scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
@@ -27,7 +26,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
@@ -47,8 +47,7 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
 template <typename Torus>
 void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template <typename Torus>
@@ -58,7 +57,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);
 #endif

 template <typename Torus>
@@ -74,13 +74,13 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);

 template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template <typename Torus>
@@ -90,7 +90,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);

 template <typename Torus>
 uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
@@ -126,7 +127,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_acc_step_two = NULL;
  int8_t *d_mem_acc_cg = NULL;
  int8_t *d_mem_acc_tbc = NULL;
-
+  uint32_t lwe_chunk_size;
  double2 *keybundle_fft;
  Torus *global_accumulator;
  double2 *global_accumulator_fft;
@@ -138,6 +139,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;
+    this->lwe_chunk_size = lwe_chunk_size;
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

    // default
--- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,17 +1,3 @@
-set(SOURCES
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -23,7 +23,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,

 void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
                                 void *lwe_array_out, void *glwe_array_in,
-                                 uint32_t *nth_array, uint32_t num_glwes,
+                                 uint32_t *nth_array, uint32_t num_nths,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size) {

@@ -31,43 +31,43 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
  case 256:
    host_sample_extract<uint64_t, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 512:
    host_sample_extract<uint64_t, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 1024:
    host_sample_extract<uint64_t, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 2048:
    host_sample_extract<uint64_t, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 4096:
    host_sample_extract<uint64_t, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 8192:
    host_sample_extract<uint64_t, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        glwe_dimension);
    break;
  case 16384:
    host_sample_extract<uint64_t, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
-        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
+        (uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
        glwe_dimension);
    break;
  default:
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -38,10 +38,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
  auto lwe_out = lwe_array_out + input_id * lwe_output_size;

  // We assume each GLWE will store the first polynomial_size inputs
-  uint32_t nth_per_glwe = params::degree;
-  auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
+  uint32_t lwe_per_glwe = params::degree;
+  auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;

-  auto nth = nth_array[input_id];
+  // nth is ensured to be in [0, lwe_per_glwe)
+  auto nth = nth_array[input_id] % lwe_per_glwe;

  sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
  sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
@@ -50,11 +51,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
 template <typename Torus, class params>
 __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
                                  Torus *lwe_array_out, Torus *glwe_array_in,
-                                  uint32_t *nth_array, uint32_t num_glwes,
+                                  uint32_t *nth_array, uint32_t num_nths,
                                  uint32_t glwe_dimension) {
  cudaSetDevice(gpu_index);

-  dim3 grid(num_glwes);
+  dim3 grid(num_nths);
  dim3 thds(params::degree / params::opt);
  sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
      lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector<uint32_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
@@ -48,3 +48,35 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
 }
+
+void scratch_packing_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
+    bool allocate_gpu_memory) { // 64-bit entry: forwards to the template
+  scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer,
+      glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
+}
+/* Packing keyswitch of num_lwes 64-bit LWE ciphertexts into glwe_array_out.
+ * Casts the untyped pointers to uint64_t* and forwards everything to
+ * host_packing_keyswitch_lwe_list_to_glwe<uint64_t>. */
+void cuda_packing_keyswitch_lwe_list_to_glwe_64(
+    void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
+    void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
+    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
+
+  host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(fp_ksk_array), fp_ks_buffer, input_lwe_dimension,
+      output_glwe_dimension, output_polynomial_size, base_log, level_count,
+      num_lwes);
+}
+
+void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
+                                                uint32_t gpu_index,
+                                                int8_t **fp_ks_buffer) {
+  cuda_drop_async(*fp_ks_buffer, static_cast<cudaStream_t>(stream), gpu_index);
+} // hands *fp_ks_buffer to cuda_drop_async; the pointer itself is not reset
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -7,6 +7,7 @@
 #include "polynomial/functions.cuh"
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
+#include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>
@@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
 }

 template <typename Torus>
-__host__ void cuda_keyswitch_lwe_ciphertext_vector(
+__host__ void host_keyswitch_lwe_ciphertext_vector(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
@@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
        GET_VARIANT_ELEMENT(lwe_input_indexes, i);

    // Compute Keyswitch
-    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
+    host_keyswitch_lwe_ciphertext_vector<Torus>(
        streams[i], gpu_indexes[i], current_lwe_array_out,
        current_lwe_output_indexes, current_lwe_array_in,
        current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
@@ -154,4 +155,160 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
  }
 }

+template <typename Torus>
+__host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
+    cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
+    bool allocate_gpu_memory) {
+  cudaSetDevice(gpu_index);
+  // Number of Torus coefficients in one GLWE ciphertext.
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+  // Two GLWE-sized slots per LWE; nothing is written if the flag is false.
+  if (allocate_gpu_memory)
+    *fp_ks_buffer = (int8_t *)cuda_malloc_async(
+        2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream,
+        gpu_index);
+}
+
+// Keyswitches one LWE ciphertext (lwe_in) into one GLWE (glwe_out) with fp_ksk.
+// Each thread produces the single output coefficient glwe_out[tid].
+// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
+// different thread blocks at the x-axis to work on that input.
+template <typename Torus>
+__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+    Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count) {
+
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  size_t glwe_size = (glwe_dimension + 1);
+
+  if (tid < glwe_size * polynomial_size) {
+    const int local_index = threadIdx.x;
+    // the output_glwe is split in chunks of blockDim.x coefficients and each
+    // x-block takes one of them (poly_id is therefore a chunk index)
+    size_t poly_id = blockIdx.x;
+    size_t coef_per_block = blockDim.x;
+
+    // number of coefficients inside fp-ksk block for each lwe_input coefficient
+    size_t ksk_block_size = glwe_size * polynomial_size * level_count;
+    // start from 0 or from the LWE body lwe_in[lwe_dimension_in], per SEL;
+    // presumably the body only where the index test holds - TODO confirm
+    glwe_out[tid] = SEL(0, lwe_in[lwe_dimension_in],
+                        tid == glwe_dimension * polynomial_size);
+
+    // Iterate through all lwe elements
+    for (int i = 0; i < lwe_dimension_in; i++) {
+      // Round and prepare decomposition
+      Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
+
+      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
+      Torus mod_b_mask = (1ll << base_log) - 1ll;
+
+      // block of key for current lwe coefficient (lwe_in[i])
+      auto ksk_block = &fp_ksk[i * ksk_block_size];
+      for (int j = 0; j < level_count; j++) {
+        auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
+        // per level: one decomposed term times ksk_glwe[tid] (= chunk[local])
+        auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block];
+        Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
+        glwe_out[tid] -= decomposed * ksk_glwe_chunk[local_index];
+      }
+    }
+  }
+}
+
+// Keyswitches every input LWE to its own GLWE (stored in d_mem), then passes
+// that GLWE through polynomial_accumulate_monic_monomial_mul with a degree
+// equal to the input index, targeting the matching slot of glwe_array_out.
+// Selects the input each thread is working on using the y-block index.
+// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
+// different thread blocks at the x-axis to work on that input.
+template <typename Torus>
+__global__ void
+packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
+                                   Torus *fp_ksk, uint32_t lwe_dimension_in,
+                                   uint32_t glwe_dimension,
+                                   uint32_t polynomial_size, uint32_t base_log,
+                                   uint32_t level_count, Torus *d_mem) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+  const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+  const int lwe_size = (lwe_dimension_in + 1);
+
+  const int input_id = blockIdx.y;
+  const int degree = input_id;
+
+  // Select an input
+  auto lwe_in = lwe_array_in + input_id * lwe_size;
+  auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
+  auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
+  // KS LWE to GLWE
+  packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
+      ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
+      polynomial_size, base_log, level_count);
+  // NOTE(review): nothing synchronizes the ks_glwe_out writes above with any
+  // in_poly access below (tid is unguarded too) - TODO confirm. P * x ^degree
+  auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size;
+  auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size;
+  polynomial_accumulate_monic_monomial_mul(out_result, in_poly, degree,
+                                           tid % polynomial_size,
+                                           polynomial_size, 1, true);
+}
+
+/// Sums num_lwes GLWEs into glwe_out. To-do: rewrite for efficiency
+template <typename Torus>
+__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
+                                 uint32_t glwe_dimension,
+                                 uint32_t polynomial_size, uint32_t num_lwes) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < (glwe_dimension + 1) * polynomial_size) {
+    glwe_out[tid] = glwe_array_in[tid];
+
+    // Accumulate the other num_lwes - 1 GLWEs, stored back to back
+    for (int i = 1; i < num_lwes; i++) {
+      auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size;
+      glwe_out[tid] += glwe_in[tid];
+    }
+  }
+}
+
+template <typename Torus>
+__host__ void host_packing_keyswitch_lwe_list_to_glwe(
+    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
+    Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
+    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_lwes) {
+  // Packs num_lwes LWEs into glwe_out: per-LWE keyswitch, then one summation.
+  if (num_lwes > polynomial_size)
+    PANIC("Cuda error: too many LWEs to pack. The number of LWEs should be "
+          "smaller than or equal to "
+          "polynomial_size.")
+
+  cudaSetDevice(gpu_index);
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads);
+  // x blocks cover the GLWE coefficients, the y index selects the input LWE
+  dim3 grid(num_blocks, num_lwes);
+  dim3 threads(num_threads);
+  // first half of the buffer: kernel scratch (d_mem); second half: its output
+  auto d_mem = (Torus *)fp_ks_buffer;
+  auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;
+
+  // individually keyswitch each lwe
+  packing_keyswitch_lwe_list_to_glwe<<<grid, threads, 0, stream>>>(
+      d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
+      glwe_dimension, polynomial_size, base_log, level_count, d_mem);
+  check_cuda_error(cudaGetLastError());
+
+  // accumulate to a single glwe
+  accumulate_glwes<<<num_blocks, threads, 0, stream>>>(
+      glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
+      num_lwes);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -2,6 +2,7 @@
 #define CNCRT_TORUS_CUH

 #include "types/int128.cuh"
+#include "utils/kernel_dimensions.cuh"
 #include <limits>

 template <typename T>
@@ -29,20 +30,18 @@ __device__ inline void typecast_double_to_torus<uint64_t>(double x,
 template <typename T>
 __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                               uint32_t level_count) {
-  T shift = sizeof(T) * 8 - level_count * base_log;
-  T mask = 1ll << (shift - 1);
-  T b = (x & mask) >> (shift - 1);
+  const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
+  const T shift = non_rep_bit_count - 1; // keep one extra bit for rounding
   T res = x >> shift;
-  res += b;
-  res <<= shift;
-  return res;
+  res += 1;       // carries into the kept bits only if the extra bit was set
+  res &= (T)(-2); // then clear the extra bit
+  return res << shift;
 }

 template <typename T>
 __device__ __forceinline__ void modulus_switch(T input, T &output,
                                               uint32_t log_modulus) {
  constexpr uint32_t BITS = sizeof(T) * 8;
-
  output = input + (((T)1) << (BITS - log_modulus - 1));
  output >>= (BITS - log_modulus);
 }
@@ -54,4 +53,27 @@ __device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
  return output;
 }

+template <typename Torus>
+__global__ void modulus_switch_inplace(Torus *array, int size,
+                                       uint32_t log_modulus) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x; // element index
+  if (tid < size) { // threads past the end of the array do nothing
+    array[tid] = modulus_switch(array[tid], log_modulus); // overwrite in place
+  }
+}
+
+template <typename Torus>
+__host__ void host_modulus_switch_inplace(cudaStream_t stream,
+                                          uint32_t gpu_index, Torus *array,
+                                          int size, uint32_t log_modulus) {
+  cudaSetDevice(gpu_index);
+  // Launch shape for `size` elements; 1024 is presumably the per-block cap
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
+  // Kernel is queued on `stream`; only the launch status is checked below
+  modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(array, size,
+                                                                 log_modulus);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -137,6 +137,30 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
  }
 }

+/// Stream-less copy between device buffers, same GPU or peer (cudaMemcpyPeer)
+void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
+                            uint32_t gpu_index) {
+  if (size == 0)
+    return; // nothing to copy; the pointers are not inspected
+  cudaPointerAttributes attr_dest;
+  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
+  if (attr_dest.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
+  }
+  cudaPointerAttributes attr_src;
+  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
+  if (attr_src.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
+  }
+  check_cuda_error(cudaSetDevice(gpu_index));
+  if (attr_src.device == attr_dest.device) { // both buffers on the same GPU
+    check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
+  } else { // buffers on different GPUs: use the peer-copy API
+    check_cuda_error(
+        cudaMemcpyPeer(dest, attr_dest.device, src, attr_src.device, size));
+  }
+}
+
 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
@@ -177,8 +201,8 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
    int num_blocks = (n + block_size - 1) / block_size;

    // Launch the kernel
-    cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
-                                                                 n);
+    cuda_set_value_kernel<Torus>
+        <<<num_blocks, block_size, 0, stream>>>(d_array, value, n);
    check_cuda_error(cudaGetLastError());
  }
 }
@@ -247,14 +271,5 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
  cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
                         gpu_index);
  check_cuda_error(cudaGetLastError());
-#if CUDA_ARCH == 900
-  max_shared_memory = 226000;
-#elif CUDA_ARCH == 890
-  max_shared_memory = 127000;
-#elif CUDA_ARCH == 800
-  max_shared_memory = 163000;
-#elif CUDA_ARCH == 700
-  max_shared_memory = 95000;
-#endif
  return max_shared_memory;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
@@ -6,6 +6,7 @@
 #include "twiddles.cuh"
 #include "types/complex/operations.cuh"

+using Index = unsigned;
 /*
 * Direct negacyclic FFT:
 *   - before the FFT the N real coefficients are stored into a
@@ -31,290 +32,81 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
   *  full loop, which should increase performance
   */

-  size_t tid = threadIdx.x;
-  size_t twid_id;
-  size_t i1, i2;
-  double2 u, v, w;
+  __syncthreads();
+  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
+  constexpr Index LOG2_DEGREE = params::log2_degree;
+  constexpr Index HALF_DEGREE = params::degree >> 1;
+  constexpr Index STRIDE = params::degree / params::opt;
+
+  Index tid = threadIdx.x;
+  double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
+
+  // load into registers
+#pragma unroll
+  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+    u[i] = A[tid];
+    v[i] = A[tid + HALF_DEGREE];
+
+    tid += STRIDE;
+  }
+
  // level 1
  // we don't make actual complex multiplication on level1 since we have only
  // one twiddle, it's real and image parts are equal, so we can multiply
  // it with simpler operations
 #pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    i1 = tid;
-    i2 = tid + params::degree / 2;
-
-    u = A[i1];
-    v = A[i2] * (double2){0.707106781186547461715008466854,
-                          0.707106781186547461715008466854};
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
+  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+    w = v[i] * (double2){0.707106781186547461715008466854,
+                         0.707106781186547461715008466854};
+    v[i] = u[i] - w;
+    u[i] = u[i] + w;
  }
-  __syncthreads();

-  // level 2
-  // from this level there are more than one twiddles and none of them has equal
-  // real and imag parts, so complete complex multiplication is needed
-  // for each level params::degree / 2^level represents number of coefficients
-  // inside divided chunk of specific level
-  //
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 4);
-    i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
-    i2 = i1 + params::degree / 4;
+  Index twiddle_shift = 1;
+  for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
+    Index lane_mask = 1 << (l - 1);
+    Index thread_mask = (1 << l) - 1;
+    twiddle_shift <<= 1;

-    w = negtwiddles[twid_id + 2];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 3
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 8);
-    i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
-    i2 = i1 + params::degree / 8;
-
-    w = negtwiddles[twid_id + 4];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 4
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 16);
-    i1 =
-        2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
-    i2 = i1 + params::degree / 16;
-
-    w = negtwiddles[twid_id + 8];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 5
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 32);
-    i1 =
-        2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
-    i2 = i1 + params::degree / 32;
-
-    w = negtwiddles[twid_id + 16];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 6
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 64);
-    i1 =
-        2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
-    i2 = i1 + params::degree / 64;
-
-    w = negtwiddles[twid_id + 32];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 7
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 128);
-    i1 = 2 * (params::degree / 128) * twid_id +
-         (tid & (params::degree / 128 - 1));
-    i2 = i1 + params::degree / 128;
-
-    w = negtwiddles[twid_id + 64];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // from level 8, we need to check size of params degree, because we support
-  // minimum actual polynomial size = 256,  when compressed size is halfed and
-  // minimum supported compressed size is 128, so we always need first 7
-  // levels of butterfly operation, since butterfly levels are hardcoded
-  // we need to check if polynomial size is big enough to require specific level
-  // of butterfly.
-  if constexpr (params::degree >= 256) {
-    // level 8
    tid = threadIdx.x;
+    __syncthreads();
 #pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 256);
-      i1 = 2 * (params::degree / 256) * twid_id +
-           (tid & (params::degree / 256 - 1));
-      i2 = i1 + params::degree / 256;
-
-      w = negtwiddles[twid_id + 128];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
+    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
+      Index rank = tid & thread_mask;
+      bool u_stays_in_register = rank < lane_mask;
+      A[tid] = (u_stays_in_register) ? v[i] : u[i];
+      tid = tid + STRIDE;
    }
    __syncthreads();
-  }

-  if constexpr (params::degree >= 512) {
-    // level 9
    tid = threadIdx.x;
 #pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 512);
-      i1 = 2 * (params::degree / 512) * twid_id +
-           (tid & (params::degree / 512 - 1));
-      i2 = i1 + params::degree / 512;
+    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
+      Index rank = tid & thread_mask;
+      bool u_stays_in_register = rank < lane_mask;
+      w = A[tid ^ lane_mask];
+      u[i] = (u_stays_in_register) ? u[i] : w;
+      v[i] = (u_stays_in_register) ? w : v[i];
+      w = negtwiddles[tid / lane_mask + twiddle_shift];

-      w = negtwiddles[twid_id + 256];
-      u = A[i1];
-      v = A[i2] * w;
+      w *= v[i];

-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
+      v[i] = u[i] - w;
+      u[i] = u[i] + w;
+      tid = tid + STRIDE;
    }
-    __syncthreads();
  }
+  __syncthreads();

-  if constexpr (params::degree >= 1024) {
-    // level 10
-    tid = threadIdx.x;
+  // store registers in SM
+  tid = threadIdx.x;
 #pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 1024);
-      i1 = 2 * (params::degree / 1024) * twid_id +
-           (tid & (params::degree / 1024 - 1));
-      i2 = i1 + params::degree / 1024;
-
-      w = negtwiddles[twid_id + 512];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 2048) {
-    // level 11
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 2048);
-      i1 = 2 * (params::degree / 2048) * twid_id +
-           (tid & (params::degree / 2048 - 1));
-      i2 = i1 + params::degree / 2048;
-
-      w = negtwiddles[twid_id + 1024];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 4096) {
-    // level 12
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 4096);
-      i1 = 2 * (params::degree / 4096) * twid_id +
-           (tid & (params::degree / 4096 - 1));
-      i2 = i1 + params::degree / 4096;
-
-      w = negtwiddles[twid_id + 2048];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 8192) {
-    // level 13
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 8192);
-      i1 = 2 * (params::degree / 8192) * twid_id +
-           (tid & (params::degree / 8192 - 1));
-      i2 = i1 + params::degree / 8192;
-
-      w = negtwiddles[twid_id + 4096];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
+  for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
+    A[tid * 2] = u[i];
+    A[tid * 2 + 1] = v[i];
+    tid = tid + STRIDE;
  }
+  __syncthreads();
 }

 /*
@@ -329,284 +121,82 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
   *  full loop, which should increase performance
   */

+  __syncthreads();
+  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
+  constexpr Index LOG2_DEGREE = params::log2_degree;
+  constexpr Index DEGREE = params::degree;
+  constexpr Index HALF_DEGREE = params::degree >> 1;
+  constexpr Index STRIDE = params::degree / params::opt;
+
  size_t tid = threadIdx.x;
-  size_t twid_id;
-  size_t i1, i2;
-  double2 u, w;
+  double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;

-  // divide input by compressed polynomial size
-  tid = threadIdx.x;
-  for (size_t i = 0; i < params::opt; ++i) {
-    A[tid] /= params::degree;
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // none of the twiddles have equal real and imag part, so
-  // complete complex multiplication has to be done
-  // here we have more than one twiddle
-  // mapping in backward fft is reversed
-  // butterfly operation is started from last level
-
-  if constexpr (params::degree >= 8192) {
-    // level 13
-    tid = threadIdx.x;
+  // load into registers and divide by compressed polynomial size
 #pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 8192);
-      i1 = 2 * (params::degree / 8192) * twid_id +
-           (tid & (params::degree / 8192 - 1));
-      i2 = i1 + params::degree / 8192;
+  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+    u[i] = A[2 * tid];
+    v[i] = A[2 * tid + 1];

-      w = negtwiddles[twid_id + 4096];
-      u = A[i1] - A[i2];
+    u[i] /= DEGREE;
+    v[i] /= DEGREE;

-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
+    tid += STRIDE;
+  }

-      tid += params::degree / params::opt;
+  Index twiddle_shift = DEGREE;
+  for (Index l = 1; l <= LOG2_DEGREE - 1; ++l) {
+    Index lane_mask = 1 << (l - 1);
+    Index thread_mask = (1 << l) - 1;
+    tid = threadIdx.x;
+    twiddle_shift >>= 1;
+
+    // at this point registers are ready for the butterfly
+    tid = threadIdx.x;
+    __syncthreads();
+#pragma unroll
+    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+      w = (u[i] - v[i]);
+      u[i] += v[i];
+      v[i] = w * conjugate(negtwiddles[tid / lane_mask + twiddle_shift]);
+
+      // keep one register for the next iteration and store the other one in sm
+      Index rank = tid & thread_mask;
+      bool u_stays_in_register = rank < lane_mask;
+      A[tid] = (u_stays_in_register) ? v[i] : u[i];
+
+      tid = tid + STRIDE;
    }
    __syncthreads();
-  }

-  if constexpr (params::degree >= 4096) {
-    // level 12
+    // prepare registers for next butterfly iteration
    tid = threadIdx.x;
 #pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 4096);
-      i1 = 2 * (params::degree / 4096) * twid_id +
-           (tid & (params::degree / 4096 - 1));
-      i2 = i1 + params::degree / 4096;
+    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+      Index rank = tid & thread_mask;
+      bool u_stays_in_register = rank < lane_mask;
+      w = A[tid ^ lane_mask];
+      u[i] = (u_stays_in_register) ? u[i] : w;
+      v[i] = (u_stays_in_register) ? w : v[i];

-      w = negtwiddles[twid_id + 2048];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
+      tid = tid + STRIDE;
    }
-    __syncthreads();
  }

-  if constexpr (params::degree >= 2048) {
-    // level 11
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 2048);
-      i1 = 2 * (params::degree / 2048) * twid_id +
-           (tid & (params::degree / 2048 - 1));
-      i2 = i1 + params::degree / 2048;
-
-      w = negtwiddles[twid_id + 1024];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 1024) {
-    // level 10
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 1024);
-      i1 = 2 * (params::degree / 1024) * twid_id +
-           (tid & (params::degree / 1024 - 1));
-      i2 = i1 + params::degree / 1024;
-
-      w = negtwiddles[twid_id + 512];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 512) {
-    // level 9
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 512);
-      i1 = 2 * (params::degree / 512) * twid_id +
-           (tid & (params::degree / 512 - 1));
-      i2 = i1 + params::degree / 512;
-
-      w = negtwiddles[twid_id + 256];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 256) {
-    // level 8
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 256);
-      i1 = 2 * (params::degree / 256) * twid_id +
-           (tid & (params::degree / 256 - 1));
-      i2 = i1 + params::degree / 256;
-
-      w = negtwiddles[twid_id + 128];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  // below level 8, we don't need to check size of params degree, because we
-  // support minimum actual polynomial size = 256,  when compressed size is
-  // halfed and minimum supported compressed size is 128, so we always need
-  // last 7 levels of butterfly operation, since butterfly levels are hardcoded
-  // we don't need to check if polynomial size is big enough to require
-  // specific level of butterfly.
-  // level 7
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 128);
-    i1 = 2 * (params::degree / 128) * twid_id +
-         (tid & (params::degree / 128 - 1));
-    i2 = i1 + params::degree / 128;
-
-    w = negtwiddles[twid_id + 64];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
+  // last iteration
+  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
+    w = (u[i] - v[i]);
+    u[i] = u[i] + v[i];
+    v[i] = w * (double2){0.707106781186547461715008466854,
+                         -0.707106781186547461715008466854};
  }
  __syncthreads();
-
-  // level 6
+  // store registers in SM
  tid = threadIdx.x;
 #pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 64);
-    i1 =
-        2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
-    i2 = i1 + params::degree / 64;
-
-    w = negtwiddles[twid_id + 32];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 5
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 32);
-    i1 =
-        2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
-    i2 = i1 + params::degree / 32;
-
-    w = negtwiddles[twid_id + 16];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 4
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 16);
-    i1 =
-        2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
-    i2 = i1 + params::degree / 16;
-
-    w = negtwiddles[twid_id + 8];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 3
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 8);
-    i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
-    i2 = i1 + params::degree / 8;
-
-    w = negtwiddles[twid_id + 4];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 2
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 4);
-    i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
-    i2 = i1 + params::degree / 4;
-
-    w = negtwiddles[twid_id + 2];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 1
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 2);
-    i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
-    i2 = i1 + params::degree / 2;
-
-    w = negtwiddles[twid_id + 1];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
+  for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
+    A[tid] = u[i];
+    A[tid + HALF_DEGREE] = v[i];
+    tid = tid + STRIDE;
  }
  __syncthreads();
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -37,12 +37,12 @@ void host_resolve_signed_overflow(
      streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
      mem->params.big_lwe_dimension, 1);

-  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                last_block_inner_propagation, x, mem->params.big_lwe_dimension,
-                1);
-  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                last_block_inner_propagation, last_block_input_carry,
-                mem->params.big_lwe_dimension, 1);
+  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                       last_block_inner_propagation, x,
+                       mem->params.big_lwe_dimension, 1);
+  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                       last_block_inner_propagation, last_block_input_carry,
+                       mem->params.big_lwe_dimension, 1);

  host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
                                      last_block_inner_propagation,
@@ -94,14 +94,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(

  // phase 1
  if (op == SIGNED_OPERATION::ADDITION) {
-    host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
-                  big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
+                         big_lwe_dimension, num_blocks);
  } else {
-    host_integer_radix_negation(
+    host_integer_radix_negation<Torus>(
        streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
-    host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
-                  big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
+                         big_lwe_dimension, num_blocks);
  }

  // phase 2
@@ -109,10 +109,10 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }

-  host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
-                              result, output_carry, input_carries,
-                              mem_ptr->scp_mem, bsks, ksks, num_blocks);
-  host_generate_last_block_inner_propagation(
+  host_propagate_single_carry<Torus>(
+      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
+      input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
+  host_generate_last_block_inner_propagation<Torus>(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
      last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
      &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
@@ -126,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
  // phase 3
  auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];

-  host_resolve_signed_overflow(
+  host_resolve_signed_overflow<Torus>(
      streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
      input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);

--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -17,7 +17,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
  std::function<uint64_t(uint64_t)> predicate_lut_f =
      [](uint64_t x) -> uint64_t { return x == 1; };

-  scratch_cuda_integer_radix_cmux_kb(
+  scratch_cuda_integer_radix_cmux_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -27,10 +27,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
    auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
    auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;

-    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
-        lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
-        lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
-        params.message_modulus, 1);
+    device_pack_bivariate_blocks<Torus>
+        <<<num_blocks, num_threads, 0, streams[0]>>>(
+            lwe_array_out_block, predicate->lwe_indexes_in,
+            lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
+            params.big_lwe_dimension, params.message_modulus, 1);
    check_cuda_error(cudaGetLastError());
  }

@@ -57,13 +58,15 @@ __host__ void host_integer_radix_cmux_kb(
  }

  auto mem_true = mem_ptr->zero_if_true_buffer;
-  zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
-              lwe_array_true, lwe_condition, mem_true,
-              mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
+  zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+                     lwe_array_true, lwe_condition, mem_true,
+                     mem_ptr->inverted_predicate_lut, bsks, ksks,
+                     num_radix_blocks);
  auto mem_false = mem_ptr->zero_if_false_buffer;
-  zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
-              lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
-              bsks, ksks, num_radix_blocks);
+  zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
+                     mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
+                     mem_false, mem_ptr->predicate_lut, bsks, ksks,
+                     num_radix_blocks);
  for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
    cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
  }
@@ -75,9 +78,9 @@ __host__ void host_integer_radix_cmux_kb(
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
  // have kept its value
  auto added_cts = mem_ptr->tmp_true_ct;
-  host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
-                mem_ptr->tmp_false_ct, params.big_lwe_dimension,
-                num_radix_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
+                       mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
+                       params.big_lwe_dimension, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -43,7 +43,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  // Add all blocks and store in sum
-  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
+  device_accumulate_all_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
      output, input, lwe_dimension, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
 }
@@ -62,7 +62,6 @@ __host__ void are_all_comparisons_block_true(
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -75,7 +74,7 @@ __host__ void are_all_comparisons_block_true(
  auto tmp_out = are_all_block_true_buffer->tmp_out;

  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t max_value = total_modulus - 1;
+  uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

  cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
                               num_radix_blocks * (big_lwe_dimension + 1) *
@@ -96,8 +95,9 @@ __host__ void are_all_comparisons_block_true(
    auto is_equal_to_num_blocks_map =
        &are_all_block_true_buffer->is_equal_to_lut_map;
    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
-                            input_blocks, big_lwe_dimension, chunk_length);
+      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
+                                   input_blocks, big_lwe_dimension,
+                                   chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -121,9 +121,8 @@ __host__ void are_all_comparisons_block_true(
            new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                     max_value, num_radix_blocks, true);

-        auto is_equal_to_num_blocks_lut_f = [max_value,
-                                             chunk_length](Torus x) -> Torus {
-          return (x & max_value) == chunk_length;
+        auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
+          return x == chunk_length;
        };
        generate_device_accumulator<Torus>(
            streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
@@ -165,7 +164,6 @@ __host__ void is_at_least_one_comparisons_block_true(
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -174,7 +172,7 @@ __host__ void is_at_least_one_comparisons_block_true(
  auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;

  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t max_value = total_modulus - 1;
+  uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

  cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
                               num_radix_blocks * (big_lwe_dimension + 1) *
@@ -192,8 +190,9 @@ __host__ void is_at_least_one_comparisons_block_true(
    auto input_blocks = mem_ptr->tmp_lwe_array_out;
    auto accumulator = buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
-                            input_blocks, big_lwe_dimension, chunk_length);
+      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
+                                   input_blocks, big_lwe_dimension,
+                                   chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -280,8 +279,8 @@ __host__ void host_compare_with_zero_equality(
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

-      accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
-                            big_lwe_dimension, chunk_size);
+      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
+                                   big_lwe_dimension, chunk_size);

      num_sum_blocks++;
      remainder_blocks -= (chunk_size - 1);
@@ -295,8 +294,9 @@ __host__ void host_compare_with_zero_equality(
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
      zero_comparison);
-  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                 sum, mem_ptr, bsks, ksks, num_sum_blocks);
+  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
+                                        lwe_array_out, sum, mem_ptr, bsks, ksks,
+                                        num_sum_blocks);
 }

 template <typename Torus>
@@ -310,7 +310,7 @@ __host__ void host_integer_radix_equality_check_kb(

  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
-  integer_radix_apply_bivariate_lookup_table_kb(
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
      bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
      eq_buffer->operator_lut->params.message_modulus);
@@ -319,9 +319,9 @@ __host__ void host_integer_radix_equality_check_kb(
  //
  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
-  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                 comparisons, mem_ptr, bsks, ksks,
-                                 num_radix_blocks);
+  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
+                                        lwe_array_out, comparisons, mem_ptr,
+                                        bsks, ksks, num_radix_blocks);
 }

 template <typename Torus>
@@ -352,19 +352,20 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

  // Subtract
  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
-                   lwe_array_right, big_lwe_dimension, num_radix_blocks);
+  host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array_out,
+                          lwe_array_left, lwe_array_right, big_lwe_dimension,
+                          num_radix_blocks);

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
      num_radix_blocks, is_non_zero_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(
+  host_integer_radix_add_scalar_one_inplace<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
      num_radix_blocks, message_modulus, carry_modulus);
 }
@@ -406,8 +407,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,

  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
-    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
-                partial_block_count, 4);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                       partial_block_count, 4);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -433,8 +434,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
  std::function<Torus(Torus)> f;

  if (partial_block_count == 2) {
-    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
-                partial_block_count, 4);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                       partial_block_count, 4);

    f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
      int msb = (x >> 2) & 3;
@@ -454,9 +455,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
  last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

  // Last leaf
-  integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
-                                                 gpu_count, lwe_array_out, y,
-                                                 bsks, ksks, 1, last_lut);
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
+      last_lut);
 }

 template <typename Torus>
@@ -488,19 +489,21 @@ __host__ void host_integer_radix_difference_check_kb(
    if (mem_ptr->is_signed) {
      packed_num_radix_blocks -= 2;
    }
-    pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
-                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
-    pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
-                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
+                       big_lwe_dimension, packed_num_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_right,
+                       lwe_array_right, big_lwe_dimension,
+                       packed_num_radix_blocks, message_modulus);
    // From this point we have half number of blocks
    packed_num_radix_blocks /= 2;

    // Clean noise
    auto identity_lut = mem_ptr->identity_lut;
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
        packed_num_radix_blocks, identity_lut);
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
        packed_num_radix_blocks, identity_lut);

@@ -517,16 +520,17 @@ __host__ void host_integer_radix_difference_check_kb(
  if (!mem_ptr->is_signed) {
    // Compare packed blocks, or simply the total number of radix blocks in the
    // inputs
-    compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
-                            rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
+    compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count, comparisons,
+                                   lhs, rhs, mem_ptr, bsks, ksks,
+                                   packed_num_radix_blocks);
    num_comparisons = packed_num_radix_blocks;
  } else {
    // Packing is possible
    if (carry_modulus >= message_modulus) {
      // Compare (num_radix_blocks - 2) / 2 packed blocks
-      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
-                              rhs, mem_ptr, bsks, ksks,
-                              packed_num_radix_blocks);
+      compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                     comparisons, lhs, rhs, mem_ptr, bsks, ksks,
+                                     packed_num_radix_blocks);

      // Compare the last block before the sign block separately
      auto identity_lut = mem_ptr->identity_lut;
@@ -535,21 +539,21 @@ __host__ void host_integer_radix_difference_check_kb(
      Torus *last_right_block_before_sign_block =
          diff_buffer->tmp_packed_right +
          packed_num_radix_blocks * big_lwe_size;
-      integer_radix_apply_univariate_lookup_table_kb(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
          identity_lut);
-      integer_radix_apply_univariate_lookup_table_kb(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
          1, identity_lut);
-      compare_radix_blocks_kb(
+      compare_radix_blocks_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + packed_num_radix_blocks * big_lwe_size,
          last_left_block_before_sign_block, last_right_block_before_sign_block,
          mem_ptr, bsks, ksks, 1);
      // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -558,11 +562,11 @@ __host__ void host_integer_radix_difference_check_kb(
      num_comparisons = packed_num_radix_blocks + 2;

    } else {
-      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
-                              lwe_array_left, lwe_array_right, mem_ptr, bsks,
-                              ksks, num_radix_blocks - 1);
+      compare_radix_blocks_kb<Torus>(
+          streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
+          lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
      // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -575,9 +579,9 @@ __host__ void host_integer_radix_difference_check_kb(
  // Reduces a vec containing radix blocks that encrypts a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
-  tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
-                      comparisons, mem_ptr->diff_buffer->tree_buffer,
-                      reduction_lut_f, bsks, ksks, num_comparisons);
+  tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                             comparisons, mem_ptr->diff_buffer->tree_buffer,
+                             reduction_lut_f, bsks, ksks, num_comparisons);
 }

 template <typename Torus>
@@ -601,16 +605,16 @@ host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                             Torus **ksks, uint32_t total_num_radix_blocks) {

  // Compute the sign
-  host_integer_radix_difference_check_kb(
+  host_integer_radix_difference_check_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
      ksks, total_num_radix_blocks);

  // Selector
-  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
-                             total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out,
+      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
+      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -0,0 +1,89 @@
+#include "compression.cuh"
+
+void scratch_cuda_integer_compress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
+    bool allocate_gpu_memory) {
+  // 64-bit entry point: builds the compression parameters, then creates the compression state in *mem_ptr.
+  int_radix_params compression_params(
+      pbs_type, compression_glwe_dimension, compression_polynomial_size,
+      (compression_glwe_dimension + 1) * compression_polynomial_size,
+      lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus, // NOTE(review): the zeros look like the PBS slots (level, base log, grouping factor), unused here - confirm
+      carry_modulus);
+  // Forwarded with uint64_t as Torus; mem_ptr is reinterpreted as int_compression<uint64_t> **.
+  scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_compression<uint64_t> **)mem_ptr, num_radix_blocks,
+      compression_params, lwe_per_glwe, storage_log_modulus,
+      allocate_gpu_memory);
+}
+void scratch_cuda_integer_decompress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
+    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
+    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
+    bool allocate_gpu_memory) {
+  // 64-bit entry point: builds the encryption and compression parameter sets, then creates the decompression state in *mem_ptr.
+  // Decompression doesn't keyswitch, so big and small dimensions are the same
+  int_radix_params encryption_params(
+      pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
+      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
+      message_modulus, carry_modulus);
+  // Compression side: the small LWE dimension is compression_glwe_dimension * compression_polynomial_size.
+  int_radix_params compression_params(
+      pbs_type, compression_glwe_dimension, compression_polynomial_size,
+      lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
+      0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);
+  // Forwarded with uint64_t as Torus; mem_ptr is reinterpreted as int_decompression<uint64_t> **.
+  scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_decompression<uint64_t> **)mem_ptr, num_radix_blocks, body_count,
+      encryption_params, compression_params, storage_log_modulus,
+      allocate_gpu_memory);
+}
+void cuda_integer_compress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
+    int8_t *mem_ptr) {
+  // Type-erased entry point: casts the opaque pointers to uint64_t types; num_nths is passed as the number of radix blocks.
+  host_integer_compress<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_nths,
+      (int_compression<uint64_t> *)mem_ptr);
+}
+void cuda_integer_decompress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
+    uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {
+  // Type-erased entry point: casts the opaque pointers to uint64_t types; indexes_array is read on the host by the callee.
+  host_integer_decompress<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out), static_cast<uint64_t *>(glwe_in),
+      indexes_array, indexes_array_size, bsks,
+      (int_decompression<uint64_t> *)mem_ptr);
+}
+
+void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
+                                                       uint32_t *gpu_indexes,
+                                                       uint32_t gpu_count,
+                                                       int8_t **mem_ptr_void) {
+  // Calls release() on the compression state. NOTE(review): the object itself is not deleted here - confirm who frees it.
+  int_compression<uint64_t> *mem_ptr =
+      (int_compression<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
+
+void cleanup_cuda_integer_decompress_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
+  // Calls release() on the decompression state. NOTE(review): the object itself is not deleted here - confirm who frees it.
+  int_decompression<uint64_t> *mem_ptr =
+      (int_decompression<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -0,0 +1,380 @@
+#ifndef CUDA_INTEGER_COMPRESSION_CUH
+#define CUDA_INTEGER_COMPRESSION_CUH
+
+#include "ciphertext.h"
+#include "compression.h"
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer/integer.cuh"
+#include "linearalgebra/multiplication.cuh"
+#include "polynomial/functions.cuh"
+#include "utils/kernel_dimensions.cuh"
+
+template <typename Torus>
+__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
+                     uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
+  auto nbits = sizeof(Torus) * 8; // bits in one Torus word
+  auto tid = threadIdx.x + blockIdx.x * blockDim.x;
+  // Packing kernel: thread tid builds output word i of GLWE number glwe_index.
+  auto glwe_index = tid / out_len;
+  auto i = tid % out_len;
+  auto chunk_array_in = array_in + glwe_index * in_len;
+  auto chunk_array_out = array_out + glwe_index * out_len;
+  // Threads past num_coeffs do nothing. Assumes every input word only uses its low log_modulus bits - TODO confirm.
+  if (tid < num_coeffs) {
+    // Index of the input word whose log_modulus-bit field contains bit i * nbits of the packed stream.
+    auto k = nbits * i / log_modulus;
+    auto j = k;
+    // Number of low bits of input word j that fall before the start of output word i.
+    auto start_shift = i * nbits - j * log_modulus;
+
+    auto value = chunk_array_in[j] >> start_shift;
+    j++;
+    // OR in the next input words, each shifted to its bit offset inside word i, until the word is full or the input ends.
+    while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
+      auto shift = j * log_modulus - i * nbits;
+      value |= chunk_array_in[j] << shift;
+      j++;
+    }
+
+    chunk_array_out[i] = value;
+  }
+}
+
+template <typename Torus>
+__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
+                        Torus *array_out, Torus *array_in, uint32_t num_glwes,
+                        uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
+  if (array_in == array_out)
+    PANIC("Cuda error: Input and output must be different");
+  // Launches the pack kernel on `stream` over num_glwes GLWEs, one thread per packed output word. num_lwes is not used.
+  cudaSetDevice(gpu_index);
+  auto compression_params = mem_ptr->compression_params;
+
+  auto log_modulus = mem_ptr->storage_log_modulus;
+  // GLWEs [0..num_glwes-1): unpacked length in_len and packed length out_len, both in Torus words
+  auto in_len = (compression_params.glwe_dimension + 1) *
+                compression_params.polynomial_size;
+  auto number_bits_to_pack = in_len * log_modulus;
+  auto nbits = sizeof(Torus) * 8;
+  // number_bits_to_pack.div_ceil(Scalar::BITS)
+  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
+
+  // Last GLWE: computed exactly like the others, so last_out_len equals out_len
+  number_bits_to_pack = in_len * log_modulus;
+  auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits;
+  // Total number of packed output words, i.e. the number of threads that write a word.
+  auto num_coeffs = (num_glwes - 1) * out_len + last_out_len;
+
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads);
+
+  dim3 grid(num_blocks);
+  dim3 threads(num_threads);
+  pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
+                                            num_coeffs, in_len, out_len);
+  check_cuda_error(cudaGetLastError());
+}
+
+template <typename Torus>
+__host__ void host_integer_compress(cudaStream_t *streams,
+                                    uint32_t *gpu_indexes, uint32_t gpu_count,
+                                    Torus *glwe_array_out, Torus *lwe_array_in,
+                                    Torus **fp_ksk, uint32_t num_radix_blocks,
+                                    int_compression<Torus> *mem_ptr) {
+  // Compresses num_radix_blocks LWEs: shift, packing keyswitch, modulus switch, bit-pack. All work is issued on streams[0].
+  auto compression_params = mem_ptr->compression_params;
+  auto input_lwe_dimension = compression_params.small_lwe_dimension;
+
+  // Shift: cleartext-multiply every input LWE by message_modulus into tmp_lwe
+  auto lwe_shifted = mem_ptr->tmp_lwe;
+  host_cleartext_multiplication<Torus>(
+      streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
+      (uint64_t)compression_params.message_modulus, input_lwe_dimension,
+      num_radix_blocks);
+
+  uint32_t lwe_in_size = input_lwe_dimension + 1;
+  uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
+                           compression_params.polynomial_size;
+  uint32_t num_glwes_for_compression =
+      num_radix_blocks / mem_ptr->lwe_per_glwe + 1;
+  // NOTE(review): if num_radix_blocks is a multiple of lwe_per_glwe, this counts one extra GLWE that the loop below never writes.
+  // Keyswitch LWEs to GLWE
+  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
+  cuda_memset_async(tmp_glwe_array_out, 0,
+                    num_glwes_for_compression *
+                        (compression_params.glwe_dimension + 1) *
+                        compression_params.polynomial_size * sizeof(Torus),
+                    streams[0], gpu_indexes[0]);
+  auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
+  auto rem_lwes = num_radix_blocks;
+
+  auto lwe_subset = lwe_shifted;
+  auto glwe_out = tmp_glwe_array_out;
+  while (rem_lwes > 0) {
+    auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);
+    // Pack up to lwe_per_glwe LWEs into the current output GLWE, using the key fp_ksk[0].
+    host_packing_keyswitch_lwe_list_to_glwe<Torus>(
+        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
+        fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
+        compression_params.polynomial_size, compression_params.ks_base_log,
+        compression_params.ks_level, chunk_size);
+    // Advance to the next chunk of input LWEs and the next output GLWE.
+    rem_lwes -= chunk_size;
+    lwe_subset += chunk_size * lwe_in_size;
+    glwe_out += glwe_out_size;
+  }
+
+  // Modulus switch of every GLWE coefficient, in place, with storage_log_modulus
+  host_modulus_switch_inplace<Torus>(
+      streams[0], gpu_indexes[0], tmp_glwe_array_out,
+      num_glwes_for_compression * (compression_params.glwe_dimension + 1) *
+          compression_params.polynomial_size,
+      mem_ptr->storage_log_modulus);
+  // Bit-pack the switched coefficients from the temporary buffer into glwe_array_out.
+  host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
+                   tmp_glwe_array_out, num_glwes_for_compression,
+                   num_radix_blocks, mem_ptr);
+}
+
+template <typename Torus>
+__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
+                        uint32_t log_modulus, uint32_t input_len,
+                        uint32_t initial_out_len) {
+  auto nbits = sizeof(Torus) * 8; // bits in one Torus word
+  // Unpacking kernel: thread i rebuilds coefficient i of the packed GLWE number `index` (input_len words per packed GLWE).
+  auto i = threadIdx.x + blockIdx.x * blockDim.x;
+  auto chunk_array_in = array_in + index * input_len;
+  if (i < initial_out_len) {
+    // Unpack: value i occupies bits [start, end) of the packed stream
+    Torus mask = ((Torus)1 << log_modulus) - 1;
+    auto start = i * log_modulus;
+    auto end = (i + 1) * log_modulus;
+
+    auto start_block = start / nbits;
+    auto start_remainder = start % nbits;
+    // Word holding the last bit of the value; it differs from start_block when the value straddles two words.
+    auto end_block_inclusive = (end - 1) / nbits;
+
+    Torus unpacked_i;
+    if (start_block == end_block_inclusive) {
+      auto single_part = chunk_array_in[start_block] >> start_remainder;
+      unpacked_i = single_part & mask;
+    } else {
+      auto first_part = chunk_array_in[start_block] >> start_remainder;
+      auto second_part = chunk_array_in[start_block + 1]
+                         << (nbits - start_remainder);
+
+      unpacked_i = (first_part | second_part) & mask;
+    }
+
+    // Extract: move the log_modulus-bit value to the most significant bits
+    glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
+  }
+}
+
+/// Extracts the glwe_index-nth GLWE ciphertext from the packed array_in into glwe_array_out (one coefficient per word)
+template <typename Torus>
+__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
+                           Torus *glwe_array_out, Torus *array_in,
+                           uint32_t glwe_index,
+                           int_decompression<Torus> *mem_ptr) {
+  if (array_in == glwe_array_out)
+    PANIC("Cuda error: Input and output must be different");
+
+  cudaSetDevice(gpu_index);
+
+  auto compression_params = mem_ptr->compression_params;
+
+  auto log_modulus = mem_ptr->storage_log_modulus;
+  // Only the full mask plus body_count body coefficients (capped at polynomial_size) are unpacked.
+  uint32_t body_count =
+      std::min(mem_ptr->body_count, compression_params.polynomial_size);
+  auto initial_out_len =
+      compression_params.glwe_dimension * compression_params.polynomial_size +
+      body_count;
+  // Length in words of one packed GLWE; the kernel uses it to locate GLWE number glwe_index in array_in.
+  auto compressed_glwe_accumulator_size =
+      (compression_params.glwe_dimension + 1) *
+      compression_params.polynomial_size;
+  auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
+  auto nbits = sizeof(Torus) * 8;
+  // number_bits_to_unpack.div_ceil(Scalar::BITS)
+  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+
+  // Ensure the tail of the GLWE (body coefficients past body_count) is zeroed
+  auto zeroed_slice = glwe_array_out + initial_out_len;
+  cuda_memset_async(zeroed_slice, 0,
+                    (compression_params.polynomial_size - body_count) *
+                        sizeof(Torus),
+                    stream, gpu_index);
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  dim3 grid(num_blocks);
+  dim3 threads(num_threads);
+  extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, array_in,
+                                               glwe_index, log_modulus,
+                                               input_len, initial_out_len);
+  check_cuda_error(cudaGetLastError());
+}
+
+template <typename Torus>
+__host__ void
+host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, Torus *d_lwe_array_out,
+                        Torus *d_packed_glwe_in, uint32_t *h_indexes_array,
+                        uint32_t indexes_array_size, void **d_bsks,
+                        int_decompression<Torus> *h_mem_ptr) {
+  // Decompresses the LWEs listed in h_indexes_array: unpack their GLWEs, sample-extract them, then PBS into d_lwe_array_out.
+  auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
+  cuda_memcpy_async_to_gpu(d_indexes_array, h_indexes_array,
+                           indexes_array_size * sizeof(uint32_t), streams[0],
+                           gpu_indexes[0]);
+
+  auto compression_params = h_mem_ptr->compression_params;
+  auto lwe_per_glwe = compression_params.polynomial_size;
+  if (indexes_array_size > lwe_per_glwe)
+    PANIC("Cuda error: too many LWEs to decompress. The number of LWEs should "
+          "be smaller than "
+          "polynomial_size.")
+
+  auto num_radix_blocks = h_mem_ptr->num_radix_blocks;
+  if (num_radix_blocks != indexes_array_size)
+    PANIC("Cuda error: wrong number of LWEs in decompress: the number of LWEs "
+          "should be the same as indexes_array_size.")
+
+  // Each pair holds (position in h_indexes_array of the last LWE that lies in
+  // the GLWE, device pointer to that extracted GLWE)
+  std::vector<std::pair<int, Torus *>> glwe_vec;
+
+  // Extract all GLWEs
+  Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
+                                compression_params.polynomial_size;
+  // Consecutive entries that fall in the same GLWE share one extraction; this relies on equal GLWE indexes being adjacent.
+  auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
+  auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
+  host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
+                      d_packed_glwe_in, current_glwe_index, h_mem_ptr);
+  glwe_vec.push_back(std::make_pair(0, extracted_glwe));
+  for (uint32_t i = 1; i < indexes_array_size; i++) {
+    auto glwe_index = h_indexes_array[i] / lwe_per_glwe;
+    if (glwe_index != current_glwe_index) {
+      extracted_glwe += glwe_accumulator_size;
+      current_glwe_index = glwe_index;
+      // Extracts a new GLWE
+      host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
+                          d_packed_glwe_in, glwe_index, h_mem_ptr);
+      glwe_vec.push_back(std::make_pair(i, extracted_glwe));
+    } else {
+      // Updates the index
+      glwe_vec.back().first++;
+    }
+  }
+  // Sample extract all LWEs, one call per extracted GLWE
+  Torus lwe_accumulator_size = compression_params.small_lwe_dimension + 1;
+
+  auto extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
+  uint32_t current_idx = 0; // position in h_indexes_array of the first LWE of the current GLWE
+  auto d_indexes_array_chunk = d_indexes_array;
+  for (const auto &max_idx_and_glwe : glwe_vec) {
+    uint32_t last_idx = max_idx_and_glwe.first;
+    extracted_glwe = max_idx_and_glwe.second;
+    // LWEs of this GLWE sit at positions current_idx..last_idx (inclusive).
+    auto num_lwes = last_idx + 1 - current_idx;
+    cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
+                                extracted_glwe, d_indexes_array_chunk, num_lwes,
+                                compression_params.glwe_dimension,
+                                compression_params.polynomial_size);
+    d_indexes_array_chunk += num_lwes;
+    extracted_lwe += num_lwes * lwe_accumulator_size;
+    current_idx = last_idx + 1; // the next GLWE starts right after last_idx; reusing last_idx would count one LWE twice
+  }
+
+  // Reset to the start of the extracted LWE buffer, which is the PBS input
+  extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
+
+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
+  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+  /// dimension to a big LWE dimension
+  auto encryption_params = h_mem_ptr->encryption_params;
+  auto lut = h_mem_ptr->carry_extract_lut;
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  if (active_gpu_count == 1) {
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
+        lut->lwe_indexes_in, d_bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride);
+  } else {
+    /// For multi GPU execution we create vectors of pointers for inputs and
+    /// outputs
+    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+    std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
+        compression_params.small_lwe_dimension + 1);
+
+    /// Apply PBS
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
+        lwe_after_pbs_vec, lut->h_lwe_indexes_out,
+        lut->using_trivial_lwe_indexes, num_radix_blocks,
+        encryption_params.big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+  }
+}
+
+template <typename Torus>
+__host__ void scratch_cuda_compress_integer_radix_ciphertext(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_compression<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params compression_params, uint32_t lwe_per_glwe,
+    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+  // Allocates a new int_compression<Torus> with the given configuration and stores its address in *mem_ptr.
+  *mem_ptr = new int_compression<Torus>(
+      streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
+      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
+}
+
+template <typename Torus>
+__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_decompression<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    uint32_t body_count, int_radix_params encryption_params,
+    int_radix_params compression_params, uint32_t storage_log_modulus,
+    bool allocate_gpu_memory) {
+  // Allocates a new int_decompression<Torus> with the given configuration and stores its address in *mem_ptr.
+  *mem_ptr = new int_decompression<Torus>(
+      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
+      num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory);
+}
+#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -282,7 +282,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
          // Shift the mask so that we will only keep bits we should
          uint32_t shifted_mask = full_message_mask >> shift_amount;

-          integer_radix_apply_univariate_lookup_table_kb(
+          integer_radix_apply_univariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
              interesting_divisor.last_block(), bsks, ksks, 1,
              mem_ptr->masking_luts_1[shifted_mask]);
@@ -310,7 +310,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
          // the estimated degree of the output is < msg_modulus
          shifted_mask = shifted_mask & full_message_mask;

-          integer_radix_apply_univariate_lookup_table_kb(
+          integer_radix_apply_univariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
              divisor_ms_blocks.first_block(), bsks, ksks, 1,
              mem_ptr->masking_luts_2[shifted_mask]);
@@ -334,7 +334,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
          interesting_remainder1.insert(0, numerator_block_1.first_block(),
                                        streams[0], gpu_indexes[0]);

-          host_integer_radix_logical_scalar_shift_kb_inplace(
+          host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
              streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
              mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);

@@ -342,7 +342,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                               interesting_remainder1.len - 1, streams[0],
                               gpu_indexes[0]);

-          host_radix_blocks_rotate_left(
+          host_radix_blocks_rotate_left<Torus>(
              streams, gpu_indexes, gpu_count, interesting_remainder1.data,
              tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);

@@ -363,7 +363,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

    auto left_shift_interesting_remainder2 =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
-          host_integer_radix_logical_scalar_shift_kb_inplace(
+          host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
              streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
              mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
        }; // left_shift_interesting_remainder2
@@ -396,10 +396,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
    // but in that position, interesting_remainder2 always has a 0
    auto &merged_interesting_remainder = interesting_remainder1;

-    host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
-                  merged_interesting_remainder.data,
-                  interesting_remainder2.data, radix_params.big_lwe_dimension,
-                  merged_interesting_remainder.len);
+    host_addition<Torus>(
+        streams[0], gpu_indexes[0], merged_interesting_remainder.data,
+        merged_interesting_remainder.data, interesting_remainder2.data,
+        radix_params.big_lwe_dimension, merged_interesting_remainder.len);

    // after create_clean_version_of_merged_remainder
    // `merged_interesting_remainder` will be reused as
@@ -439,7 +439,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
        // We could call unchecked_scalar_ne
        // But we are in the special case where scalar == 0
        // So we can skip some stuff
-        host_compare_with_zero_equality(
+        host_compare_with_zero_equality<Torus>(
            streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
            mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
@@ -447,7 +447,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
        tmp_1.len =
            ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);

-        is_at_least_one_comparisons_block_true(
+        is_at_least_one_comparisons_block_true<Torus>(
            streams, gpu_indexes, gpu_count,
            at_least_one_upper_block_is_non_zero.data, tmp_1.data,
            mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
@@ -460,7 +460,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
    //  `cleaned_merged_interesting_remainder` - radix ciphertext
    auto create_clean_version_of_merged_remainder =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
-          integer_radix_apply_univariate_lookup_table_kb(
+          integer_radix_apply_univariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data, bsks, ksks,
@@ -486,10 +486,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

-    host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
-                  subtraction_overflowed.data,
-                  at_least_one_upper_block_is_non_zero.data,
-                  radix_params.big_lwe_dimension, 1);
+    host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
+                         subtraction_overflowed.data,
+                         at_least_one_upper_block_is_non_zero.data,
+                         radix_params.big_lwe_dimension, 1);

    int factor = (i) ? 3 : 2;
    int factor_lut_id = factor - 2;
@@ -528,10 +528,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
          mem_ptr->merge_overflow_flags_luts[pos_in_block]
              ->params.message_modulus);

-      host_addition(streams[0], gpu_indexes[0],
-                    &quotient[block_of_bit * big_lwe_size],
-                    &quotient[block_of_bit * big_lwe_size],
-                    did_not_overflow.data, radix_params.big_lwe_dimension, 1);
+      host_addition<Torus>(
+          streams[0], gpu_indexes[0], &quotient[block_of_bit * big_lwe_size],
+          &quotient[block_of_bit * big_lwe_size], did_not_overflow.data,
+          radix_params.big_lwe_dimension, 1);
    };

    for (uint j = 0; j < gpu_count; j++) {
@@ -564,17 +564,17 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

  // Clean the quotient and remainder
  // as even though they have no carries, they are not at nominal noise level
-  host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
-                remainder2.data, radix_params.big_lwe_dimension,
-                remainder1.len);
+  host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1.data,
+                       remainder2.data, radix_params.big_lwe_dimension,
+                       remainder1.len);

  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
      bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
      ksks, num_blocks, mem_ptr->message_extract_lut_2);
  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -53,7 +53,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

-  scratch_cuda_propagate_single_carry_kb_inplace(
+  scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
@@ -131,6 +131,19 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

+void cuda_apply_many_univariate_lut_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
+    void **bsks, uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {
+  // Type-erased entry point: casts the opaque pointers to uint64_t types and forwards to host_apply_many_univariate_lut_kb.
+  host_apply_many_univariate_lut_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<uint64_t *>(input_radix_lwe),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
+      lut_count, lut_stride);
+}
+
 void scratch_cuda_apply_bivariate_lut_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -195,15 +208,15 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
-    void **bsks, uint32_t num_blocks, uint32_t shift) {
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift) {

  int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(output_radix_lwe),
-      static_cast<uint64_t *>(input_radix_lwe), params,
+      static_cast<uint64_t *>(generates_or_propagates), params,
      (int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      num_blocks);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -78,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
          "pointers should be different");
  }
  cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_right<<<blocks_count, 1024, 0, streams[0]>>>(
+  radix_blocks_rotate_right<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
      dst, src, value, blocks_count, lwe_size);
 }

@@ -95,7 +95,7 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
          "pointers should be different");
  }
  cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_left<<<blocks_count, 1024, 0, streams[0]>>>(
+  radix_blocks_rotate_left<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
      dst, src, value, blocks_count, lwe_size);
 }

@@ -124,8 +124,8 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t lwe_size) {
  cudaSetDevice(gpu_indexes[0]);
  int num_blocks = blocks_count / 2, num_threads = 1024;
-  radix_blocks_reverse_lwe_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
-      src, blocks_count, lwe_size);
+  radix_blocks_reverse_lwe_inplace<Torus>
+      <<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
 }

 // polynomial_size threads
@@ -164,9 +164,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,
  int num_blocks = 0, num_threads = 0;
  int num_entries = num_radix_blocks * (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
-      lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
-      lwe_dimension, shift, num_radix_blocks);
+  device_pack_bivariate_blocks<Torus>
+      <<<num_blocks, num_threads, 0, streams[0]>>>(
+          lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2,
+          lwe_indexes_in, lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
 }

@@ -188,6 +189,93 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
+  /// For multi GPU execution we create vectors of pointers for inputs and
+  /// outputs
+  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  if (active_gpu_count == 1) {
+    execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
+                                   lwe_trivial_indexes_vec[0], lwe_array_in,
+                                   lut->lwe_indexes_in, ksks, big_lwe_dimension,
+                                   small_lwe_dimension, ks_base_log, ks_level,
+                                   num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
+        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
+        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
+  } else {
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
+        big_lwe_dimension + 1);
+
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                   lwe_after_ks_vec, lwe_trivial_indexes_vec,
+                                   lwe_array_in_vec, lwe_trivial_indexes_vec,
+                                   ksks, big_lwe_dimension, small_lwe_dimension,
+                                   ks_base_log, ks_level, num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
+        lut_stride);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      lwe_array_out, lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes,
+                                      num_radix_blocks, big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+  }
+}
+
+template <typename Torus>
+__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t lut_count,
+    uint32_t lut_stride) {
+  // apply_lookup_table
+  auto params = lut->params;
+  auto pbs_type = params.pbs_type;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+  auto small_lwe_dimension = params.small_lwe_dimension;
+  auto ks_level = params.ks_level;
+  auto ks_base_log = params.ks_base_log;
+  auto pbs_level = params.pbs_level;
+  auto pbs_base_log = params.pbs_base_log;
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
+  auto grouping_factor = params.grouping_factor;
+
  /// For multi GPU execution we create vectors of pointers for inputs and
  /// outputs
  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
@@ -210,7 +298,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type);
+        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -236,7 +324,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks, pbs_type);
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
+        lut_stride);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -271,12 +360,16 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
+
  // Left message is shifted
  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
-  pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
-                        lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
-                        lut->lwe_indexes_in, big_lwe_dimension, shift,
-                        num_radix_blocks);
+  pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count,
+                               lwe_array_pbs_in, lut->lwe_trivial_indexes,
+                               lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
+                               big_lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  /// For multi GPU execution we create vectors of pointers for inputs and
@@ -301,7 +394,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type);
+        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
  } else {
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    multi_gpu_scatter_lwe_async<Torus>(
@@ -323,7 +416,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks, pbs_type);
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
+        lut_stride);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -380,7 +474,7 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
    body[i] = -body[i];
  }

-  rotate_left(body, half_box_size, polynomial_size);
+  rotate_left<Torus>(body, half_box_size, polynomial_size);
 }

 template <typename Torus>
@@ -442,7 +536,6 @@ void generate_device_accumulator_bivariate(
                                         message_modulus, carry_modulus, f);

  // copy host lut and lut_indexes_vec to device
-  cuda_synchronize_stream(stream, gpu_index);
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                           (glwe_dimension + 1) * polynomial_size *
                               sizeof(Torus),
@@ -508,7 +601,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, f);

-  cuda_synchronize_stream(stream, gpu_index);
  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
@@ -590,13 +682,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
      ksks, num_blocks, luts_array);

  // compute prefix sum with hillis&steele
-  host_compute_prefix_sum_hillis_steele(
+  host_compute_prefix_sum_hillis_steele<Torus>(
      streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
      params, luts_carry_propagation_sum, bsks, ksks, num_blocks);

-  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
-                                 generates_or_propagates, 1, num_blocks,
-                                 big_lwe_size);
+  host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                        step_output, generates_or_propagates, 1,
+                                        num_blocks, big_lwe_size);
  if (carry_out != nullptr) {
    cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes,
                                 streams[0], gpu_indexes[0]);
@@ -610,8 +702,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 gpu_indexes[0]);
  }

-  host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
-                glwe_dimension * polynomial_size, num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
+                       step_output, glwe_dimension * polynomial_size,
+                       num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -664,14 +757,15 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
      overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
      big_lwe_size_bytes, streams[0], gpu_indexes[0]);

-  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
-                                 generates_or_propagates, 1, num_blocks,
-                                 big_lwe_size);
+  host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                        step_output, generates_or_propagates, 1,
+                                        num_blocks, big_lwe_size);
  cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
                    gpu_indexes[0]);

-  host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
-                   step_output, glwe_dimension * polynomial_size, num_blocks);
+  host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
+                          step_output, glwe_dimension * polynomial_size,
+                          num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -697,6 +791,9 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
  int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
  int small_lwe_size = (params.small_lwe_dimension + 1);

+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  for (int i = 0; i < num_blocks; i++) {
    auto cur_input_block = &input_blocks[i * big_lwe_size];

@@ -719,7 +816,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
        mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
        params.glwe_dimension, params.small_lwe_dimension,
        params.polynomial_size, params.pbs_base_log, params.pbs_level,
-        params.grouping_factor, 2, params.pbs_type);
+        params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride);

    cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
                                 big_lwe_size * sizeof(Torus), streams[0],
@@ -727,10 +824,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,

    if (i < num_blocks - 1) {
      auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
-      host_addition(streams[0], gpu_indexes[0], next_input_block,
-                    next_input_block,
-                    &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
-                    params.big_lwe_dimension, 1);
+      host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
+                           next_input_block,
+                           &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
+                           params.big_lwe_dimension, 1);
    }
  }
 }
@@ -765,7 +862,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
    }

    if (num_radix_blocks % 2 == 1) {
-      // We couldn't pack the last block, so we just copy it
+      // We couldn't pack the last block, so we just copy it
      Torus *lsb_block =
          lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
      Torus *last_block =
@@ -794,7 +891,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
-  device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
+  device_pack_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
      lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
 }

@@ -840,7 +937,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_create_trivial_radix<<<grid, thds, 0, stream>>>(
+  device_create_trivial_radix<Torus><<<grid, thds, 0, stream>>>(
      lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
 }
@@ -857,7 +954,7 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
                             uint32_t num_radix_blocks, uint32_t bits_per_block,
                             int_bit_extract_luts_buffer<Torus> *bit_extract) {

-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
      num_radix_blocks * bits_per_block, bit_extract->lut);
 }
@@ -870,7 +967,6 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
             std::function<Torus(Torus)> sign_handler_f, void **bsks,
             Torus **ksks, uint32_t num_sign_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
@@ -904,9 +1000,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    while (num_sign_blocks > 2) {
-      pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
-                  big_lwe_dimension, num_sign_blocks, 4);
-      integer_radix_apply_univariate_lookup_table_kb(
+      pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
+                         big_lwe_dimension, num_sign_blocks, 4);
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
          num_sign_blocks / 2, lut);

@@ -937,11 +1033,11 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
        final_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
-                2, 4);
-    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
-                                                   gpu_count, signs_array_out,
-                                                   signs_b, bsks, ksks, 1, lut);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
+                       big_lwe_dimension, 2, 4);
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
+        1, lut);

  } else {

@@ -957,9 +1053,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
        final_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
-                                                   gpu_count, signs_array_out,
-                                                   signs_a, bsks, ksks, 1, lut);
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
+        1, lut);
  }
 }

@@ -992,6 +1088,18 @@ void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
      num_blocks, mem);
 }

+template <typename Torus>
+void host_apply_many_univariate_lut_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *radix_lwe_out, Torus *radix_lwe_in, int_radix_lut<Torus> *mem,
+    Torus **ksks, void **bsks, uint32_t num_blocks, uint32_t lut_count,
+    uint32_t lut_stride) {
+
+  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
+      num_blocks, mem, lut_count, lut_stride);
+}
+
 template <typename Torus>
 void scratch_cuda_apply_bivariate_lut_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -241,7 +241,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 1024:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -249,7 +250,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 2048:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -257,7 +259,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 4096:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -265,7 +268,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 8192:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -273,7 +277,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 16384:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -281,7 +286,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -186,9 +186,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
    uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
-    int_radix_lut<Torus> *reused_lut = nullptr) {
+    int_radix_lut<Torus> *reused_lut) {

  auto new_blocks = mem_ptr->new_blocks;
+  auto new_blocks_copy = mem_ptr->new_blocks_copy;
  auto old_blocks = mem_ptr->old_blocks;
  auto small_lwe_vector = mem_ptr->small_lwe_vector;

@@ -205,12 +206,31 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
  auto small_lwe_size = small_lwe_dimension + 1;

+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
+
+  if (num_radix_in_vec == 0)
+    return;
+  if (num_radix_in_vec == 1) {
+    cuda_memcpy_async_gpu_to_gpu(radix_lwe_out, terms,
+                                 num_blocks_in_radix * big_lwe_size *
+                                     sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+    return;
+  }
  if (old_blocks != terms) {
    cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
                                 num_blocks_in_radix * num_radix_in_vec *
                                     big_lwe_size * sizeof(Torus),
                                 streams[0], gpu_indexes[0]);
  }
+  if (num_radix_in_vec == 2) {
+    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                         &old_blocks[num_blocks * big_lwe_size],
+                         big_lwe_dimension, num_blocks);
+    return;
+  }

  size_t r = num_radix_in_vec;
  size_t total_modulus = message_modulus * carry_modulus;
@@ -271,7 +291,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    if (!ch_amount)
      ch_amount++;
    dim3 add_grid(ch_amount, num_blocks, 1);
-    size_t sm_size = big_lwe_size * sizeof(Torus);

    cudaSetDevice(gpu_indexes[0]);
    tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
@@ -288,7 +307,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
        terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
        h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
        total_count, message_count, carry_count, sm_copy_count);
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
    auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
    luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
@@ -303,8 +321,11 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    // inside d_smart_copy_in there are only -1 values
    // it's fine to call smart_copy with same pointer
    // as source and destination
-    smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
-        new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
+    cuda_memcpy_async_gpu_to_gpu(new_blocks_copy, new_blocks,
+                                 r * num_blocks * big_lwe_size * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+    smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
+        new_blocks, new_blocks_copy, d_smart_copy_out, d_smart_copy_in,
        big_lwe_size);
    check_cuda_error(cudaGetLastError());

@@ -347,7 +368,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          glwe_dimension, small_lwe_dimension, polynomial_size,
          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
          mem_ptr->params.grouping_factor, total_count,
-          mem_ptr->params.pbs_type);
+          mem_ptr->params.pbs_type, lut_count, lut_stride);
    } else {
      cuda_synchronize_stream(streams[0], gpu_indexes[0]);

@@ -395,7 +416,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          glwe_dimension, small_lwe_dimension, polynomial_size,
          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
          mem_ptr->params.grouping_factor, total_count,
-          mem_ptr->params.pbs_type);
+          mem_ptr->params.pbs_type, lut_count, lut_stride);

      multi_gpu_gather_lwe_async<Torus>(
          streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
@@ -422,9 +443,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  luts_message_carry->release(streams, gpu_indexes, gpu_count);
  delete (luts_message_carry);

-  host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-                &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
-                num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                       &old_blocks[num_blocks * big_lwe_size],
+                       big_lwe_dimension, num_blocks);
 }

 template <typename Torus, class params>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -1,14 +1,16 @@
 #include "integer/negation.cuh"

-void cuda_negate_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus) {
+void cuda_negate_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus) {

-  host_integer_radix_negation(
+  host_integer_radix_negation<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_array),
-      lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), lwe_dimension,
+      lwe_ciphertext_count, message_modulus, carry_modulus);
 }

 void scratch_cuda_integer_radix_overflowing_sub_kb_64(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -25,14 +25,13 @@ template <typename Torus>
 __global__ void
 device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
                              uint64_t lwe_dimension, uint64_t message_modulus,
-                              uint64_t carry_modulus, uint64_t delta) {
+                              uint64_t delta) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < lwe_dimension + 1) {
    bool is_body = (tid == lwe_dimension);

    // z = ceil( degree / 2^p ) * 2^p
    uint64_t z = (2 * message_modulus - 1) / message_modulus;
-    __syncthreads();
    z *= message_modulus;

    // (0,Delta*z) - ct
@@ -47,12 +46,9 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,

      uint64_t encoded_zb = zb * delta;

-      __syncthreads();
-
      // (0,Delta*z) - ct
      output[tid] =
          (is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
-      __syncthreads();
    }
  }
 }
@@ -75,16 +71,15 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);
-  uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_negation<<<grid, thds, shared_mem, streams[0]>>>(
+  device_integer_radix_negation<<<grid, thds, 0, streams[0]>>>(
      output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
-      carry_modulus, delta);
+      delta);
  check_cuda_error(cudaGetLastError());
 }

@@ -107,7 +102,7 @@ __host__ void host_integer_overflowing_sub_kb(

  auto radix_params = mem_ptr->params;

-  host_unchecked_sub_with_correcting_term(
+  host_unchecked_sub_with_correcting_term<Torus>(
      streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left,
      radix_lwe_right, radix_params.big_lwe_dimension, num_blocks,
      radix_params.message_modulus, radix_params.carry_modulus,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
@@ -5,7 +5,7 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {

-  host_integer_radix_scalar_addition_inplace(
+  host_integer_radix_scalar_addition_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
      lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -18,10 +18,8 @@ __global__ void device_integer_radix_scalar_addition_inplace(

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
-    Torus scalar = scalar_input[tid];
-    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
-
-    *body += scalar * delta;
+    lwe_array[tid * (lwe_dimension + 1) + lwe_dimension] +=
+        scalar_input[tid] * delta;
  }
 }

@@ -45,9 +43,10 @@ __host__ void host_integer_radix_scalar_addition_inplace(
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_scalar_addition_inplace<<<grid, thds, 0, streams[0]>>>(
-      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
-      delta);
+  device_integer_radix_scalar_addition_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
+                                      input_lwe_ciphertext_count, lwe_dimension,
+                                      delta);
  check_cuda_error(cudaGetLastError());
 }

@@ -83,8 +82,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0, streams[0]>>>(
-      lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
+  device_integer_radix_add_scalar_one_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, input_lwe_ciphertext_count,
+                                      lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
 }

@@ -122,10 +122,10 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
-                                                    streams[0]>>>(
-      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
-      delta);
+  device_integer_radix_scalar_subtraction_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
+                                      input_lwe_ciphertext_count, lwe_dimension,
+                                      delta);
  check_cuda_error(cudaGetLastError());
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -3,6 +3,58 @@

 #include "integer/comparison.cuh"

+template <typename Torus>
+__host__ void scalar_compare_radix_blocks_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {
+
+  if (num_radix_blocks == 0)
+    return;
+  auto params = mem_ptr->params;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+  auto message_modulus = params.message_modulus;
+  auto carry_modulus = params.carry_modulus;
+
+  // Block-wise comparison of lwe_array_in against scalar_blocks via subtract,
+  // LUT, then add one. When rhs > lhs the subtraction overflows and the
+  // padding bit is set to 1, so the PBS output is negated (modulo the message
+  // space).
+  //
+  // Example:
+  // lhs: 1, rhs: 3, message modulus: 4, carry modulus: 4
+  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
+  // Since there was an overflow, the padding bit is 1 and not 0.
+  // When applying the LUT to an input value of 14 we would expect 1,
+  // but since the padding bit is 1 we get -1 modulo our message space,
+  // so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000.
+
+  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
+  cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
+                               num_radix_blocks * (big_lwe_dimension + 1) *
+                                   sizeof(Torus),
+                               streams[0], gpu_indexes[0]);
+  // Subtract the scalar blocks in place from the copied input blocks.
+  // Here we need the true LWE sub, not the one that comes from shortint.
+  host_integer_radix_scalar_subtraction_inplace<Torus>(
+      streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
+      big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
+
+  // Apply the is_non_zero LUT to compare each difference to 0
+  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
+      ksks, num_radix_blocks, sign_lut);
+
+  // Add one
+  // Here lwe_array_out can hold: (-1) % (message modulus * carry modulus),
+  // 0, or 1, so the output values after the addition will be: 0, 1, 2
+  host_integer_radix_add_scalar_one_inplace<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
+      num_radix_blocks, message_modulus, carry_modulus);
+}
+
 template <typename Torus>
 __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -45,10 +97,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
  if (total_num_scalar_blocks == 0) {
    // We only have to compare blocks with zero
    // means scalar is zero
-    host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
-                                    mem_ptr->tmp_lwe_array_out, lwe_array_in,
-                                    mem_ptr, bsks, ksks, total_num_radix_blocks,
-                                    mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
+        lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks,
+        mem_ptr->is_zero_lut);

    auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
      x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -91,10 +143,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

-    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                total_num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       total_num_scalar_blocks, message_modulus);

    // From this point we have half number of blocks
    num_lsb_radix_blocks /= 2;
@@ -106,22 +159,22 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    // - 2 if lhs > rhs

    auto comparisons = mem_ptr->tmp_block_comparisons;
-    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                   comparisons, lhs, rhs, mem_ptr, bsks, ksks,
-                                   num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
+                                          comparisons, lhs, rhs, mem_ptr, bsks,
+                                          ksks, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
-                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        mem_ptr->identity_lut_f, bsks, ksks,
-                        num_lsb_radix_blocks);
+    tree_sign_reduction<Torus>(
+        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+        mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
+        num_lsb_radix_blocks);
    //////////////
    // msb
-    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                    lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
-                                    num_msb_radix_blocks, mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality<Torus>(
+        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
+        bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
@@ -145,7 +198,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        scalar_bivariate_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
        lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);

@@ -159,10 +212,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

-    pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       num_scalar_blocks, message_modulus);

    // From this point we have half number of blocks
    num_lsb_radix_blocks /= 2;
@@ -173,16 +227,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    // - 1 if lhs == rhs
    // - 2 if lhs > rhs
    auto comparisons = mem_ptr->tmp_lwe_array_out;
-    scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
-                                   lhs, rhs, mem_ptr, bsks, ksks,
-                                   num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                          comparisons, lhs, rhs, mem_ptr, bsks,
+                                          ksks, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
-                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        sign_handler_f, bsks, ksks, num_lsb_radix_blocks);
+    tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                               comparisons, mem_ptr->diff_buffer->tree_buffer,
+                               sign_handler_f, bsks, ksks,
+                               num_lsb_radix_blocks);
  }
 }

@@ -229,7 +284,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // We only have to compare blocks with zero
    // means scalar is zero
    Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
-    host_compare_with_zero_equality(
+    host_compare_with_zero_equality<Torus>(
        streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
        mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
    Torus *sign_block =
@@ -277,7 +332,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        scalar_bivariate_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
        sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);

@@ -304,10 +359,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

-    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                total_num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       total_num_scalar_blocks, message_modulus);

    // From this point we have half number of blocks
    num_lsb_radix_blocks /= 2;
@@ -319,24 +375,24 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // - 2 if lhs > rhs

    auto comparisons = mem_ptr->tmp_block_comparisons;
-    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                   comparisons, lhs, rhs, mem_ptr, bsks, ksks,
-                                   num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
+                                          comparisons, lhs, rhs, mem_ptr, bsks,
+                                          ksks, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
-                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        mem_ptr->identity_lut_f, bsks, ksks,
-                        num_lsb_radix_blocks);
+    tree_sign_reduction<Torus>(
+        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+        mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
+        num_lsb_radix_blocks);
    //////////////
    // msb
    // We remove the last block (which is the sign)
    Torus *are_all_msb_zeros = lwe_array_msb_out;
-    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                    are_all_msb_zeros, msb, mem_ptr, bsks, ksks,
-                                    num_msb_radix_blocks, mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality<Torus>(
+        msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb, mem_ptr,
+        bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);

    auto sign_bit_pos = (int)log2(message_modulus) - 1;

@@ -371,7 +427,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
        are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
        signed_msb_lut->params.message_modulus);
@@ -382,8 +438,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    //////////////
    // Reduce the two blocks into one final
-    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
-                 lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2);
+    reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                        lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks,
+                        2);

  } else {
    // We only have to do the regular comparison
@@ -403,10 +460,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

-    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                num_lsb_radix_blocks - 1, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks - 1,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       num_lsb_radix_blocks - 1, message_modulus);

    // From this point we have half number of blocks
    num_lsb_radix_blocks /= 2;
@@ -415,19 +473,19 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // - 0 if lhs < rhs
    // - 1 if lhs == rhs
    // - 2 if lhs > rhs
-    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                   lwe_array_ct_out, lhs, rhs, mem_ptr, bsks,
-                                   ksks, num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
+                                          lwe_array_ct_out, lhs, rhs, mem_ptr,
+                                          bsks, ksks, num_lsb_radix_blocks);
    Torus *encrypted_sign_block =
        lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
    Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);

    auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
-    create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
-                         scalar_sign_block, big_lwe_dimension, 1, 1,
-                         message_modulus, carry_modulus);
+    create_trivial_radix<Torus>(
+        msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
+        big_lwe_dimension, 1, 1, message_modulus, carry_modulus);

-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
        encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
        mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
@@ -439,9 +497,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
-                 lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
-                 num_lsb_radix_blocks + 1);
+    reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                        lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
+                        num_lsb_radix_blocks + 1);
  }
 }

@@ -452,14 +510,13 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  // Calculates the difference sign between the ciphertext and the scalar
  // - 0 if lhs < rhs
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto sign = mem_ptr->tmp_lwe_array_out;
-  integer_radix_signed_scalar_difference_check_kb(
+  integer_radix_signed_scalar_difference_check_kb<Torus>(
      streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
      total_num_scalar_blocks);
@@ -469,17 +526,17 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
  auto lwe_array_left = lwe_array_in;
  auto lwe_array_right = mem_ptr->tmp_block_comparisons;

-  create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
-                       scalar_blocks, params.big_lwe_dimension,
-                       total_num_radix_blocks, total_num_scalar_blocks,
-                       params.message_modulus, params.carry_modulus);
+  create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
+                              scalar_blocks, params.big_lwe_dimension,
+                              total_num_radix_blocks, total_num_scalar_blocks,
+                              params.message_modulus, params.carry_modulus);

  // Selector
  // CMUX for Max or Min
-  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             sign, lwe_array_left, lwe_array_right,
-                             mem_ptr->cmux_buffer, bsks, ksks,
-                             total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                    lwe_array_out, sign, lwe_array_left,
+                                    lwe_array_right, mem_ptr->cmux_buffer, bsks,
+                                    ksks, total_num_radix_blocks);
 }

 template <typename Torus>
@@ -492,12 +549,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(

  if (mem_ptr->is_signed) {
    // is signed and scalar is positive
-    integer_radix_signed_scalar_difference_check_kb(
+    integer_radix_signed_scalar_difference_check_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
        total_num_radix_blocks, total_num_scalar_blocks);
  } else {
-    integer_radix_unsigned_scalar_difference_check_kb(
+    integer_radix_unsigned_scalar_difference_check_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
        total_num_radix_blocks, total_num_scalar_blocks);
@@ -513,70 +570,16 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb(

  if (mem_ptr->is_signed) {
    // is signed and scalar is positive
-    integer_radix_signed_scalar_maxmin_kb(
+    integer_radix_signed_scalar_maxmin_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
        total_num_scalar_blocks);
  } else {
-    integer_radix_unsigned_scalar_maxmin_kb(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
-        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
-        total_num_scalar_blocks);
+    PANIC("Cuda error: only signed scalar maxmin can be called in signed "
+          "scalar comparison")
  }
 }

-template <typename Torus>
-__host__ void scalar_compare_radix_blocks_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
-    uint32_t num_radix_blocks) {
-
-  if (num_radix_blocks == 0)
-    return;
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  // When rhs > lhs, the subtraction will overflow, and the bit of padding will
-  // be set to 1
-  // meaning that the output of the pbs will be the negative (modulo message
-  // space)
-  //
-  // Example:
-  // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
-  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
-  // Since there was an overflow the bit of padding is 1 and not 0.
-  // When applying the LUT for an input value of 14 we would expect 1,
-  // but since the bit of padding is 1, we will get -1 modulus our message
-  // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
-
-  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
-  cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
-                               num_radix_blocks * (big_lwe_dimension + 1) *
-                                   sizeof(Torus),
-                               streams[0], gpu_indexes[0]);
-  // Subtract
-  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_integer_radix_scalar_subtraction_inplace(
-      streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
-      big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
-
-  // Apply LUT to compare to 0
-  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb(
-      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
-      ksks, num_radix_blocks, sign_lut);
-
-  // Add one
-  // Here Lhs can have the following values: (-1) % (message modulus * carry
-  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(
-      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
-      num_radix_blocks, message_modulus, carry_modulus);
-}
-
 template <typename Torus>
 __host__ void host_integer_radix_scalar_maxmin_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -591,7 +594,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto sign = mem_ptr->tmp_lwe_array_out;
-  host_integer_radix_scalar_difference_check_kb(
+  host_integer_radix_scalar_difference_check_kb<Torus>(
      streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
      total_num_scalar_blocks);
@@ -601,17 +604,17 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  auto lwe_array_left = lwe_array_in;
  auto lwe_array_right = mem_ptr->tmp_block_comparisons;

-  create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
-                       scalar_blocks, params.big_lwe_dimension,
-                       total_num_radix_blocks, total_num_scalar_blocks,
-                       params.message_modulus, params.carry_modulus);
+  create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
+                              scalar_blocks, params.big_lwe_dimension,
+                              total_num_radix_blocks, total_num_scalar_blocks,
+                              params.message_modulus, params.carry_modulus);

  // Selector
  // CMUX for Max or Min
-  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
-                             total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out,
+      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
+      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
 }

 template <typename Torus>
@@ -659,10 +662,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
    auto packed_scalar =
        packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;

-    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0,
-                num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_scalar,
+                       scalar_blocks, 0, num_scalar_blocks, message_modulus);

    cuda_memcpy_async_gpu_to_gpu(
        scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
@@ -670,7 +674,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
        gpu_indexes[0]);
    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
        bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
  }
@@ -689,9 +693,9 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
      PANIC("Cuda error: integer operation not supported")
    }

-    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                    lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
-                                    num_msb_radix_blocks, msb_lut);
+    host_compare_with_zero_equality<Torus>(
+        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
+        bsks, ksks, num_msb_radix_blocks, msb_lut);
  }

  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
@@ -701,13 +705,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb(

  switch (mem_ptr->op) {
  case COMPARISON_TYPE::EQ:
-    are_all_comparisons_block_true(
+    are_all_comparisons_block_true<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
        mem_ptr, bsks, ksks,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
    break;
  case COMPARISON_TYPE::NE:
-    is_at_least_one_comparisons_block_true(
+    is_at_least_one_comparisons_block_true<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
        mem_ptr, bsks, ksks,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -65,7 +65,7 @@ __host__ void host_integer_scalar_mul_radix(
      cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
                                   lwe_size_bytes * num_radix_blocks,
                                   streams[0], gpu_indexes[0]);
-      host_integer_radix_logical_scalar_shift_kb_inplace(
+      host_integer_radix_logical_scalar_shift_kb_inplace<T>(
          streams, gpu_indexes, gpu_count, ptr, shift_amount,
          mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
    } else {
@@ -82,15 +82,16 @@ __host__ void host_integer_scalar_mul_radix(
          preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
      T *block_shift_buffer =
          all_shifted_buffer + j * num_radix_blocks * lwe_size;
-      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                     block_shift_buffer, preshifted_radix_ct,
-                                     i / msg_bits, num_radix_blocks, lwe_size);
+      host_radix_blocks_rotate_right<T>(
+          streams, gpu_indexes, gpu_count, block_shift_buffer,
+          preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size);
      // create trivial assign for value = 0
      cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
                        streams[0], gpu_indexes[0]);
      j++;
    }
  }
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);

  cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
  mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
@@ -108,7 +109,7 @@ __host__ void host_integer_scalar_mul_radix(
    host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
        terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
-        num_radix_blocks, j);
+        num_radix_blocks, j, nullptr);

    auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
    host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
  // one block is responsible to process single lwe ciphertext
  if (mem->shift_type == LEFT_SHIFT) {
    // rotate right as the blocks are from LSB to MSB
-    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                   rotated_buffer, lwe_array, rotations,
-                                   num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          rotated_buffer, lwe_array, rotations,
+                                          num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, streams[0],
@@ -70,9 +70,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
-    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                   giver_blocks, lwe_array, 1, num_blocks,
-                                   big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          giver_blocks, lwe_array, 1,
+                                          num_blocks, big_lwe_size);

    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

@@ -83,9 +83,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

  } else {
    // rotate left as the blocks are from LSB to MSB
-    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                  rotated_buffer, lwe_array, rotations,
-                                  num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, streams[0],
@@ -97,8 +97,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
-    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
-                                  lwe_array, 1, num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         giver_blocks, lwe_array, 1, num_blocks,
+                                         big_lwe_size);

    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -53,9 +53,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

  if (mem->shift_type == LEFT_SHIFT) {
    // rotate right as the blocks are from LSB to MSB
-    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                   rotated_buffer, lwe_array, rotations,
-                                   num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          rotated_buffer, lwe_array, rotations,
+                                          num_blocks, big_lwe_size);

    // create trivial assign for value = 0
    cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
@@ -83,9 +83,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

  } else {
    // right shift
-    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                  rotated_buffer, lwe_array, rotations,
-                                  num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks, big_lwe_size);

    // rotate left as the blocks are from LSB to MSB
    // create trivial assign for value = 0
@@ -156,9 +156,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
  Torus *last_block_copy = &padding_block[big_lwe_size];

  if (mem->shift_type == RIGHT_SHIFT) {
-    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                  rotated_buffer, lwe_array, rotations,
-                                  num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks, big_lwe_size);
    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, streams[0],
                                 gpu_indexes[0]);
@@ -213,7 +213,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      }
      auto lut_univariate_padding_block =
          mem->lut_buffers_univariate[num_bits_in_block - 1];
-      integer_radix_apply_univariate_lookup_table_kb(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
          last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
      // Replace blocks 'pulled' from the left with the correct padding
@@ -227,7 +227,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      if (shift_within_block != 0) {
        auto lut_univariate_shift_last_block =
            mem->lut_buffers_univariate[shift_within_block - 1];
-        integer_radix_apply_univariate_lookup_table_kb(
+        integer_radix_apply_univariate_lookup_table_kb<Torus>(
            mem->local_streams_2, gpu_indexes, gpu_count, last_block,
            last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
      }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -88,9 +88,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    switch (mem->shift_type) {
    case LEFT_SHIFT:
      // rotate right as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                     rotated_input, input_bits_b, rotations,
-                                     total_nb_bits, big_lwe_size);
+      host_radix_blocks_rotate_right<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);

      if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
        for (int i = 0; i < rotations; i++)
@@ -103,9 +103,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
      break;
    case RIGHT_SHIFT:
      // rotate left as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                    rotated_input, input_bits_b, rotations,
-                                    total_nb_bits, big_lwe_size);
+      host_radix_blocks_rotate_left<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);

      if (mem->is_signed)
        for (int i = 0; i < rotations; i++)
@@ -119,38 +119,37 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
      break;
    case LEFT_ROTATE:
      // rotate right as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                     rotated_input, input_bits_b, rotations,
-                                     total_nb_bits, big_lwe_size);
+      host_radix_blocks_rotate_right<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);
      break;
    case RIGHT_ROTATE:
      // rotate left as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                    rotated_input, input_bits_b, rotations,
-                                    total_nb_bits, big_lwe_size);
+      host_radix_blocks_rotate_left<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);
      break;
    default:
      PANIC("Unknown operation")
    }

-    // pack bits into one block so that we have
+    // pack bits into one block so that we have
    // control_bit|b|a
-    cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
-                      streams[0], gpu_indexes[0]); // Do we need this?
-    pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs,
-                          mux_lut->lwe_indexes_out, rotated_input, input_bits_a,
-                          mux_lut->lwe_indexes_in, big_lwe_dimension, 2,
-                          total_nb_bits);
+    pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count, mux_inputs,
+                                 mux_lut->lwe_indexes_out, rotated_input,
+                                 input_bits_a, mux_lut->lwe_indexes_in,
+                                 big_lwe_dimension, 2, total_nb_bits);

    // The shift bit is already properly aligned/positioned
    for (int i = 0; i < total_nb_bits; i++)
-      host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size,
-                    mux_inputs + i * big_lwe_size, shift_bit,
-                    mem->params.big_lwe_dimension, 1);
+      host_addition<Torus>(streams[0], gpu_indexes[0],
+                           mux_inputs + i * big_lwe_size,
+                           mux_inputs + i * big_lwe_size, shift_bit,
+                           mem->params.big_lwe_dimension, 1);

    // we have
    // control_bit|b|a
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
        total_nb_bits, mux_lut);
  }
@@ -179,8 +178,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    auto bit_to_add = input_bits_a + i * big_lwe_size;

    for (int j = 0; j < num_radix_blocks; j++) {
-      host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add,
-                    big_lwe_dimension, 1);
+      host_addition<Torus>(streams[0], gpu_indexes[0], block, block, bit_to_add,
+                           big_lwe_dimension, 1);

      block += big_lwe_size;
      bit_to_add += bits_per_block * big_lwe_size;
@@ -188,7 +187,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

    // To give back a clean ciphertext
    auto cleaning_lut = mem->cleaning_lut;
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
        num_radix_blocks, cleaning_lut);
  }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -11,11 +11,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {

-  host_addition(static_cast<cudaStream_t>(stream), gpu_index,
-                static_cast<uint32_t *>(lwe_array_out),
-                static_cast<uint32_t *>(lwe_array_in_1),
-                static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
-                input_lwe_ciphertext_count);
+  host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint32_t *>(lwe_array_out),
+                          static_cast<uint32_t *>(lwe_array_in_1),
+                          static_cast<uint32_t *>(lwe_array_in_2),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }

 /*
@@ -51,11 +51,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {

-  host_addition(static_cast<cudaStream_t>(stream), gpu_index,
-                static_cast<uint64_t *>(lwe_array_out),
-                static_cast<uint64_t *>(lwe_array_in_1),
-                static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
-                input_lwe_ciphertext_count);
+  host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint64_t *>(lwe_array_out),
+                          static_cast<uint64_t *>(lwe_array_in_1),
+                          static_cast<uint64_t *>(lwe_array_in_2),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }
 /*
 * Perform the addition of a u32 input LWE ciphertext vector with a u32
@@ -66,11 +66,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
-                          static_cast<uint32_t *>(lwe_array_out),
-                          static_cast<uint32_t *>(lwe_array_in),
-                          static_cast<uint32_t *>(plaintext_array_in),
-                          input_lwe_dimension, input_lwe_ciphertext_count);
+  host_addition_plaintext<uint32_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint32_t *>(lwe_array_out),
+      static_cast<uint32_t *>(lwe_array_in),
+      static_cast<uint32_t *>(plaintext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
 }
 /*
 * Perform the addition of a u64 input LWE ciphertext vector with a u64 input
@@ -105,9 +106,10 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
-                          static_cast<uint64_t *>(lwe_array_out),
-                          static_cast<uint64_t *>(lwe_array_in),
-                          static_cast<uint64_t *>(plaintext_array_in),
-                          input_lwe_dimension, input_lwe_ciphertext_count);
+  host_addition_plaintext<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(plaintext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -40,10 +40,10 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
-                               (lwe_dimension + 1) * lwe_ciphertext_count,
-                               stream, gpu_index);
-  plaintext_addition<<<grid, thds, 0, stream>>>(
+  cuda_memcpy_async_gpu_to_gpu(
+      output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count * sizeof(T),
+      stream, gpu_index);
+  plaintext_addition<T><<<grid, thds, 0, stream>>>(
      output, lwe_input, plaintext_input, lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
 }
@@ -78,7 +78,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  addition<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
+  addition<T><<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
  check_cuda_error(cudaGetLastError());
 }

@@ -112,7 +112,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  subtraction<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
+  subtraction<T>
+      <<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
  check_cuda_error(cudaGetLastError());
 }

@@ -150,7 +151,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
                                   (input_lwe_dimension + 1) * sizeof(T),
                               stream, gpu_index);

-  radix_body_subtraction_inplace<<<grid, thds, 0, stream>>>(
+  radix_body_subtraction_inplace<T><<<grid, thds, 0, stream>>>(
      output, plaintext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
 }
@@ -176,7 +177,6 @@ __global__ void unchecked_sub_with_correcting_term(
  }
 }
 template <typename T>
-
 __host__ void host_unchecked_sub_with_correcting_term(
    cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
@@ -193,7 +193,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  unchecked_sub_with_correcting_term<<<grid, thds, 0, stream>>>(
+  unchecked_sub_with_correcting_term<T><<<grid, thds, 0, stream>>>(
      output, input_1, input_2, num_entries, lwe_size, message_modulus,
      carry_modulus, degree);
  check_cuda_error(cudaGetLastError());
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
@@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
-                                static_cast<uint32_t *>(lwe_array_out),
-                                static_cast<uint32_t *>(lwe_array_in),
-                                static_cast<uint32_t *>(cleartext_array_in),
-                                input_lwe_dimension,
-                                input_lwe_ciphertext_count);
+  host_cleartext_vec_multiplication<uint32_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint32_t *>(lwe_array_out),
+      static_cast<uint32_t *>(lwe_array_in),
+      static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
 }
 /*
 * Perform the multiplication of a u64 input LWE ciphertext vector with a u64
@@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
-                                static_cast<uint64_t *>(lwe_array_out),
-                                static_cast<uint64_t *>(lwe_array_in),
-                                static_cast<uint64_t *>(cleartext_array_in),
-                                input_lwe_dimension,
-                                input_lwe_ciphertext_count);
+  host_cleartext_vec_multiplication<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -14,9 +14,10 @@
 #include <vector>

 template <typename T>
-__global__ void
-cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
-                         uint32_t input_lwe_dimension, uint32_t num_entries) {
+__global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
+                                             T *cleartext_input,
+                                             uint32_t input_lwe_dimension,
+                                             uint32_t num_entries) {

  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
@@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
  }
 }

+template <typename T>
+__host__ void
+host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
+                                  T *output, T *lwe_input, T *cleartext_input,
+                                  uint32_t input_lwe_dimension,
+                                  uint32_t input_lwe_ciphertext_count) {
+
+  cudaSetDevice(gpu_index);
+  // lwe_size includes the presence of the body
+  // whereas lwe_dimension is the number of elements in the mask
+  int lwe_size = input_lwe_dimension + 1;
+  // Create a 1-dimensional grid of threads
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = input_lwe_ciphertext_count * lwe_size;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  cleartext_vec_multiplication<T><<<grid, thds, 0, stream>>>(
+      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
+  check_cuda_error(cudaGetLastError());
+}
+
+template <typename T>
+__global__ void
+cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
+                         uint32_t input_lwe_dimension, uint32_t num_entries) {
+
+  int tid = threadIdx.x;
+  int index = blockIdx.x * blockDim.x + tid;
+  if (index < num_entries) {
+    // Here we take advantage of the wrapping behaviour of uint
+    output[index] = lwe_input[index] * cleartext_input;
+  }
+}
+
 template <typename T>
 __host__ void
 host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
-                              T *output, T *lwe_input, T *cleartext_input,
+                              T *output, T *lwe_input, T cleartext_input,
                              uint32_t input_lwe_dimension,
                              uint32_t input_lwe_ciphertext_count) {

@@ -45,7 +82,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  cleartext_multiplication<<<grid, thds, 0, stream>>>(
+  cleartext_multiplication<T><<<grid, thds, 0, stream>>>(
      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
@@ -10,10 +10,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

-  host_negation(static_cast<cudaStream_t>(stream), gpu_index,
-                static_cast<uint32_t *>(lwe_array_out),
-                static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
-                input_lwe_ciphertext_count);
+  host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint32_t *>(lwe_array_out),
+                          static_cast<uint32_t *>(lwe_array_in),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }

 /*
@@ -44,8 +44,8 @@ void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

-  host_negation(static_cast<cudaStream_t>(stream), gpu_index,
-                static_cast<uint64_t *>(lwe_array_out),
-                static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
-                input_lwe_ciphertext_count);
+  host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint64_t *>(lwe_array_out),
+                          static_cast<uint64_t *>(lwe_array_in),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
@@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  negation<<<grid, thds, 0, stream>>>(output, input, num_entries);
+  negation<T><<<grid, thds, 0, stream>>>(output, input, num_entries);
  check_cuda_error(cudaGetLastError());
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -24,8 +24,8 @@ __device__ const T *get_ith_mask_kth_block(const T *ptr, int i, int k,
                                           uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
-              level * polynomial_size / 2 * (glwe_dimension + 1) *
-                  (glwe_dimension + 1) +
+              (level_count - level - 1) * polynomial_size / 2 *
+                  (glwe_dimension + 1) * (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
 }

@@ -35,8 +35,8 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                     int glwe_dimension, uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
-              level * polynomial_size / 2 * (glwe_dimension + 1) *
-                  (glwe_dimension + 1) +
+              (level_count - level - 1) * polynomial_size / 2 *
+                  (glwe_dimension + 1) * (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
 }
 template <typename T>
@@ -45,8 +45,8 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                     int glwe_dimension, uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
-              level * polynomial_size / 2 * (glwe_dimension + 1) *
-                  (glwe_dimension + 1) +
+              (level_count - level - 1) * polynomial_size / 2 *
+                  (glwe_dimension + 1) * (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              glwe_dimension * polynomial_size / 2];
 }
@@ -116,12 +116,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
  switch (polynomial_size) {
  case 256:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -134,12 +128,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -152,12 +140,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -170,12 +152,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -188,12 +164,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -206,12 +176,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -224,12 +188,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -270,14 +228,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 256:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -291,14 +241,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -312,14 +254,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -333,14 +267,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -354,14 +280,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -375,14 +293,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -396,14 +306,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                           FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -127,7 +127,8 @@ void execute_pbs_async(
    std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type) {
+    uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, uint32_t lut_count,
+    uint32_t lut_stride) {
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits
@@ -159,7 +160,8 @@ void execute_pbs_async(
            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu,
+            lut_count, lut_stride);
      }
      break;
    default:
@@ -198,7 +200,7 @@ void execute_pbs_async(
            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
            polynomial_size, grouping_factor, base_log, level_count,
-            num_inputs_on_gpu);
+            num_inputs_on_gpu, lut_count, lut_stride);
      }
      break;
    case CLASSICAL:
@@ -226,7 +228,8 @@ void execute_pbs_async(
            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu,
+            lut_count, lut_stride);
      }
      break;
    default:
@@ -268,9 +271,8 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
      if (grouping_factor == 0)
        PANIC("Multi-bit PBS error: grouping factor should be > 0.")
      scratch_cuda_multi_bit_programmable_bootstrap_64(
-          stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
-          polynomial_size, level_count, grouping_factor,
-          input_lwe_ciphertext_count, allocate_gpu_memory);
+          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
+          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
      break;
    case CLASSICAL:
      scratch_cuda_programmable_bootstrap_64(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
@@ -1,15 +1,5 @@
 #include "programmable_bootstrap_amortized.cuh"

-/*
- * Returns the buffer size for 64 bits executions
- */
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count) {
-  return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
-      glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
-}
-
 /*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the amortized PBS on 32 bits inputs, into `buffer`. It also
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -258,28 +258,6 @@ __host__ void scratch_programmable_bootstrap_amortized(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  uint64_t full_sm =
-      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
-          polynomial_size, glwe_dimension);
-  uint64_t partial_sm =
-      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
-          polynomial_size);
-  int max_shared_memory = cuda_get_max_shared_memory(0);
-  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
-    cudaFuncSetAttribute(
-        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm);
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
-        cudaFuncCachePreferShared);
-  } else if (max_shared_memory >= partial_sm) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
-    check_cuda_error(cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
-        cudaFuncCachePreferShared));
-  }
  if (allocate_gpu_memory) {
    uint64_t buffer_size =
        get_buffer_size_programmable_bootstrap_amortized<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -44,7 +44,8 @@ __global__ void device_programmable_bootstrap_cg(
    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+    uint64_t device_memory_size_per_block, uint32_t lut_count,
+    uint32_t lut_stride) {

  grid_group grid = this_grid();

@@ -151,8 +152,38 @@ __global__ void device_programmable_bootstrap_cg(
    // we do the computation at block 0 to avoid waiting for extra blocks, in
    // case they're not synchronized
    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+    if (lut_count > 1) {
+      for (int i = 1; i < lut_count; i++) {
+        auto next_lwe_array_out =
+            lwe_array_out +
+            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+        auto next_block_lwe_array_out =
+            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                    (glwe_dimension * polynomial_size + 1) +
+                                blockIdx.y * polynomial_size];
+
+        sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                           accumulator, glwe_dimension,
+                                           i * lut_stride);
+      }
+    }
  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+    if (lut_count > 1) {
+      for (int i = 1; i < lut_count; i++) {
+
+        auto next_lwe_array_out =
+            lwe_array_out +
+            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+        auto next_block_lwe_array_out =
+            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                    (glwe_dimension * polynomial_size + 1) +
+                                blockIdx.y * polynomial_size];
+
+        sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                           accumulator, 0, i * lut_stride);
+      }
+    }
  }
 }

@@ -163,30 +194,6 @@ __host__ void scratch_programmable_bootstrap_cg(
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  uint64_t full_sm =
-      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
-  uint64_t partial_sm =
-      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
-          polynomial_size);
-  int max_shared_memory = cuda_get_max_shared_memory(0);
-  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else if (max_shared_memory >= partial_sm) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  }
-
  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);
@@ -202,7 +209,8 @@ __host__ void host_programmable_bootstrap_cg(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+    uint32_t lut_count, uint32_t lut_stride) {

  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
@@ -226,7 +234,7 @@ __host__ void host_programmable_bootstrap_cg(
  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

-  void *kernel_args[14];
+  void *kernel_args[16];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
@@ -240,6 +248,8 @@ __host__ void host_programmable_bootstrap_cg(
  kernel_args[10] = &base_log;
  kernel_args[11] = &level_count;
  kernel_args[12] = &d_mem;
+  kernel_args[14] = &lut_count;
+  kernel_args[15] = &lut_stride;

  if (max_shared_memory < partial_sm) {
    kernel_args[13] = &full_dm;
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -30,7 +30,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
-        int8_t *device_mem, uint64_t device_memory_size_per_block) {
+        int8_t *device_mem, uint64_t device_memory_size_per_block,
+        uint32_t lut_count, uint32_t lut_stride) {

  grid_group grid = this_grid();

@@ -129,9 +130,44 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
+      // Always extract the first result; the lut_count > 1 case adds the rest
      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, glwe_dimension,
+                                             i * lut_stride);
+        }
+      }
+
    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
+
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
+      }
    }
  } else {
    // Load the accumulator calculated in previous iterations
@@ -177,69 +213,6 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  uint64_t full_sm_keybundle =
-      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
-          polynomial_size);
-  uint64_t full_sm_cg_accumulate =
-      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
-          polynomial_size);
-  uint64_t partial_sm_cg_accumulate =
-      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
-          polynomial_size);
-
-  int max_shared_memory = cuda_get_max_shared_memory(0);
-  if (max_shared_memory < full_sm_keybundle) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
-                                                          FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
-                                                          FULLSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  }
-
-  if (max_shared_memory < partial_sm_cg_accumulate) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
-                                                              NOSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
-                                                              NOSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else if (max_shared_memory < full_sm_cg_accumulate) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
-                                                              PARTIALSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
-                                                              PARTIALSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
-                                                              FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
-                                                              FULLSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  }
-
  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
@@ -256,8 +229,9 @@ __host__ void execute_cg_external_product_loop(
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {

+  auto lwe_chunk_size = buffer->lwe_chunk_size;
  uint64_t full_dm =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
@@ -275,13 +249,15 @@ __host__ void execute_cg_external_product_loop(

  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;

  auto d_mem = buffer->d_mem_acc_cg;
  auto keybundle_fft = buffer->keybundle_fft;
  auto global_accumulator = buffer->global_accumulator;
  auto buffer_fft = buffer->global_accumulator_fft;

-  void *kernel_args[20];
+  void *kernel_args[22];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
@@ -301,6 +277,8 @@ __host__ void execute_cg_external_product_loop(
  kernel_args[16] = &chunk_size;
  kernel_args[17] = &keybundle_size_per_input;
  kernel_args[18] = &d_mem;
+  kernel_args[20] = &lut_count;
+  kernel_args[21] = &lut_stride;

  dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
  dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -333,10 +311,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
    Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

-  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-      gpu_index, num_samples, polynomial_size);
+  auto lwe_chunk_size = buffer->lwe_chunk_size;

  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {
@@ -345,14 +323,15 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
    execute_compute_keybundle<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
+        grouping_factor, level_count, lwe_offset);

    // Accumulate
    execute_cg_external_product_loop<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
+        grouping_factor, base_log, level_count, lwe_offset, lut_count,
+        lut_stride);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -122,7 +122,8 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride) {

  switch (polynomial_size) {
  case 256:
@@ -130,49 +131,56 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 512:
    host_programmable_bootstrap_tbc<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 1024:
    host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 2048:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 4096:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 8192:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 16384:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -182,25 +190,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
 }
 #endif

-/*
- * Returns the buffer size for 64 bits executions
- */
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count) {
-
-  if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
-          glwe_dimension, polynomial_size, level_count,
-          input_lwe_ciphertext_count))
-    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count);
-  else
-    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count);
-}
-
 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_cg(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
@@ -389,7 +378,8 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride) {

  switch (polynomial_size) {
  case 256:
@@ -397,49 +387,56 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 512:
    host_programmable_bootstrap_cg<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 1024:
    host_programmable_bootstrap_cg<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 2048:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 4096:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 8192:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 16384:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -455,7 +452,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride) {

  switch (polynomial_size) {
  case 256:
@@ -463,49 +461,56 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 512:
    host_programmable_bootstrap<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 1024:
    host_programmable_bootstrap<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 2048:
    host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 4096:
    host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 8192:
    host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 16384:
    host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -522,7 +527,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples) {
+    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {

  if (base_log > 32)
    PANIC("Cuda error (classical PBS): base log should be > number of bits "
@@ -542,7 +547,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
 #else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -556,7 +562,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case DEFAULT:
    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
@@ -567,7 +574,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -641,7 +649,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples) {
+    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be > number of bits "
          "in the ciphertext representation (64)");
@@ -660,7 +668,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
 #else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -674,7 +683,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case PBS_VARIANT::DEFAULT:
    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -685,7 +695,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -713,7 +724,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -722,7 +734,8 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
    void *stream, uint32_t gpu_index,
@@ -742,7 +755,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
@@ -751,7 +765,8 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
    void *stream, uint32_t gpu_index,
@@ -779,7 +794,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);
 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
@@ -787,7 +803,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);
 template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -141,7 +141,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        Torus *global_accumulator, double2 *global_accumulator_fft,
        uint32_t lwe_iteration, uint32_t lwe_dimension,
        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-        int8_t *device_mem, uint64_t device_memory_size_per_block) {
+        int8_t *device_mem, uint64_t device_memory_size_per_block,
+        uint32_t lut_count, uint32_t lut_stride) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -216,8 +217,38 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, glwe_dimension,
+                                             i * lut_stride);
+        }
+      }
    } else if (blockIdx.y == glwe_dimension) {
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
+      }
    }
  } else {
    // Persist the updated accumulator
@@ -278,55 +309,6 @@ __host__ void scratch_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

-  uint64_t full_sm_step_one =
-      get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
-          polynomial_size);
-  uint64_t full_sm_step_two =
-      get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
-          polynomial_size);
-  uint64_t partial_sm =
-      get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);
-
-  int max_shared_memory = cuda_get_max_shared_memory(0);
-
-  // Configure step one
-  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else if (max_shared_memory >= partial_sm) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  }
-
-  // Configure step two
-  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else if (max_shared_memory >= partial_sm) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  }
-
  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
@@ -375,16 +357,15 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
 }

 template <typename Torus, class params>
-__host__ void
-execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-                 Torus *lwe_output_indexes, Torus *lut_vector,
-                 Torus *lut_vector_indexes, double2 *bootstrapping_key,
-                 Torus *global_accumulator, double2 *global_accumulator_fft,
-                 uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
-                 uint32_t glwe_dimension, uint32_t polynomial_size,
-                 uint32_t base_log, uint32_t level_count, int8_t *d_mem,
-                 int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
-                 uint64_t full_sm, uint64_t full_dm) {
+__host__ void execute_step_two(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    double2 *bootstrapping_key, Torus *global_accumulator,
+    double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration,
+    uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm,
+    uint64_t full_dm, uint32_t lut_count, uint32_t lut_stride) {

  int max_shared_memory = cuda_get_max_shared_memory(0);
  cudaSetDevice(gpu_index);
@@ -397,21 +378,21 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, full_dm);
+            level_count, d_mem, full_dm, lut_count, lut_stride);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, partial_dm);
+            level_count, d_mem, partial_dm, lut_count, lut_stride);
  } else {
    device_programmable_bootstrap_step_two<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, 0);
+            level_count, d_mem, 0, lut_count, lut_stride);
  }
  check_cuda_error(cudaGetLastError());
 }
@@ -425,7 +406,8 @@ __host__ void host_programmable_bootstrap(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+    uint32_t lut_count, uint32_t lut_stride) {
  cudaSetDevice(gpu_index);

  // With SM each block corresponds to either the mask or body, no need to
@@ -461,7 +443,8 @@ __host__ void host_programmable_bootstrap(
        lut_vector_indexes, bootstrapping_key, global_accumulator,
        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
-        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two);
+        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
+        lut_count, lut_stride);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -65,7 +65,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -78,7 +79,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 512:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -86,7 +87,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 1024:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -94,7 +95,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 2048:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -102,7 +103,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 4096:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -110,7 +111,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 8192:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -118,7 +119,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 16384:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -126,7 +127,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -142,7 +143,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -155,7 +157,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 512:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -163,7 +165,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 1024:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -171,7 +173,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 2048:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -179,7 +181,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 4096:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -187,7 +189,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 8192:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -195,7 +197,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 16384:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -203,7 +205,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -218,7 +220,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride) {

  pbs_buffer<uint64_t, MULTI_BIT> *buffer =
      (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
@@ -235,7 +238,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
 #else
    PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
@@ -250,7 +253,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case PBS_VARIANT::DEFAULT:
    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -262,7 +265,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
@@ -328,52 +331,51 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
 template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 512:
    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 1024:
    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 2048:
    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 4096:
    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 8192:
    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 16384:
    scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -383,10 +385,9 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
 }

 void scratch_cuda_multi_bit_programmable_bootstrap_64(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    bool allocate_gpu_memory) {
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

 #if (CUDA_ARCH >= 900)
  if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
@@ -394,8 +395,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
          level_count))
    scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
-        lwe_dimension, glwe_dimension, polynomial_size, level_count,
-        grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
+        glwe_dimension, polynomial_size, level_count,
+        input_lwe_ciphertext_count, allocate_gpu_memory);
  else
 #endif
      if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
@@ -408,8 +409,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
  else
    scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
        stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
-        lwe_dimension, glwe_dimension, polynomial_size, level_count,
-        grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
+        glwe_dimension, polynomial_size, level_count,
+        input_lwe_ciphertext_count, allocate_gpu_memory);
 }

 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
@@ -440,6 +441,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,

  int max_blocks_per_sm;
  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
  if (max_shared_memory < full_sm_keybundle)
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm,
@@ -486,10 +488,9 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,

 template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
    void *stream, uint32_t gpu_index,
-    pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    bool allocate_gpu_memory);
+    pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template void
 cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -499,7 +500,8 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);

 template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
    void *stream, uint32_t gpu_index,
@@ -515,7 +517,8 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);

 template bool
 has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
@@ -526,52 +529,51 @@ has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
 template <typename Torus>
 void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  switch (polynomial_size) {
  case 256:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 512:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 1024:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 2048:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 4096:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 8192:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  case 16384:
    scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+        polynomial_size, level_count, input_lwe_ciphertext_count,
+        allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -586,7 +588,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -599,7 +602,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 512:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -607,7 +610,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 1024:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -615,23 +618,37 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
-  case 2048:
-    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+  case 2048: {
+    int num_sms = 0;
+    check_cuda_error(cudaDeviceGetAttribute(
+        &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
+
+    if (4 * num_sms < num_samples * level_count * (glwe_dimension + 1))
+      host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
+          static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
+          lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
+          lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
+          lwe_dimension, polynomial_size, grouping_factor, base_log,
+          level_count, num_samples, lut_count, lut_stride);
+    else
+      host_tbc_multi_bit_programmable_bootstrap<Torus, Degree<2048>>(
+          static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
+          lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
+          lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
+          lwe_dimension, polynomial_size, grouping_factor, base_log,
+          level_count, num_samples, lut_count, lut_stride);
+
    break;
+  }
  case 4096:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 8192:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -639,7 +656,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 16384:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -647,7 +664,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -658,8 +675,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(

 template void scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
    void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);

 template void
@@ -670,5 +686,6 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -18,9 +18,9 @@
 #include <vector>

 template <typename Torus, class params>
-__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
-                                            uint32_t ggsw_idx,
-                                            uint32_t grouping_factor) {
+__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
+                                               uint32_t ggsw_idx,
+                                               uint32_t grouping_factor) {
  Torus x = 0;
  for (int i = 0; i < grouping_factor; i++) {
    uint32_t mask_position = grouping_factor - (i + 1);
@@ -31,6 +31,13 @@ __device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
  return modulus_switch(x, params::log2_degree + 1);
 }

+__device__ __forceinline__ int
+get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
+                          uint32_t level_count) {
+  const int glwe_size = glwe_dimension + 1; // factor applied twice below
+  return polynomial_size * glwe_size * glwe_size * level_count;
+}
+
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    const Torus *__restrict__ lwe_array_in,
@@ -60,8 +67,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
  uint32_t input_idx = blockIdx.x / lwe_chunk_size;

  if (lwe_iteration < (lwe_dimension / grouping_factor)) {
-    //
-    Torus *accumulator = (Torus *)selected_memory;

    const Torus *block_lwe_array_in =
        &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
@@ -81,56 +86,52 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
        bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
        grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-    const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+    const Torus *bsk_poly_ini = bsk_slice + poly_id * params::degree;

-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        bsk_poly, accumulator);
+    Torus reg_acc[params::opt];
+
+    copy_polynomial_in_regs<Torus, params::opt, params::degree / params::opt>(
+        bsk_poly_ini, reg_acc);
+
+    int offset =
+        get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count);
+
+    // Precalculate the monomial degrees and store them in shared memory
+    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
+    if (threadIdx.x < (1 << grouping_factor)) {
+      const Torus *lwe_array_group =
+          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
+          lwe_array_group, threadIdx.x, grouping_factor);
+    }
+    synchronize_threads_in_block();

    // Accumulate the other terms
    for (int g = 1; g < (1 << grouping_factor); g++) {

-      const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
-          bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
-          grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-      const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+      uint32_t monomial_degree = monomial_degrees[g];

-      // Calculates the monomial degree
-      const Torus *lwe_array_group =
-          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
-          lwe_array_group, g, grouping_factor);
-
-      synchronize_threads_in_block();
+      const Torus *bsk_poly = bsk_poly_ini + g * offset;
      // Multiply by the bsk element
-      polynomial_product_accumulate_by_monomial<Torus, params>(
-          accumulator, bsk_poly, monomial_degree, false);
+      polynomial_product_accumulate_by_monomial_nosync<Torus, params>(
+          reg_acc, bsk_poly, monomial_degree);
    }
+    synchronize_threads_in_block(); // needed because we are going to reuse the
+                                    // shared memory for the fft

-    synchronize_threads_in_block();
-
-    // Move accumulator to local memory
-    double2 temp[params::opt / 2];
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt / 2; i++) {
-      temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
-      temp[i].y =
-          __ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
-      temp[i].x /= (double)std::numeric_limits<Torus>::max();
-      temp[i].y /= (double)std::numeric_limits<Torus>::max();
-      tid += params::degree / params::opt;
-    }
-
-    synchronize_threads_in_block();
    // Move from local memory back to shared memory but as complex
-    tid = threadIdx.x;
+    int tid = threadIdx.x;
    double2 *fft = (double2 *)selected_memory;
 #pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
-      fft[tid] = temp[i];
+      fft[tid] =
+          make_double2(__ll2double_rn((int64_t)reg_acc[i]) /
+                           (double)std::numeric_limits<Torus>::max(),
+                       __ll2double_rn((int64_t)reg_acc[i + params::opt / 2]) /
+                           (double)std::numeric_limits<Torus>::max());
      tid += params::degree / params::opt;
    }
-    synchronize_threads_in_block();
+
    NSMFFT_direct<HalfDegree<params>>(fft);

    // lwe iteration
@@ -251,7 +252,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
        uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
        uint32_t lwe_chunk_size, int8_t *device_mem,
-        uint64_t device_memory_size_per_block) {
+        uint64_t device_memory_size_per_block, uint32_t lut_count,
+        uint32_t lut_stride) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
@@ -324,8 +326,38 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             global_slice, glwe_dimension,
+                                             i * lut_stride);
+        }
+      }
    } else if (blockIdx.y == glwe_dimension) {
      sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             global_slice, 0, i * lut_stride);
+        }
+      }
    }
  }
 }
@@ -353,101 +385,9 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
 template <typename Torus, typename params>
 __host__ void scratch_multi_bit_programmable_bootstrap(
    cudaStream_t stream, uint32_t gpu_index,
-    pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
-    bool allocate_gpu_memory) {
-
-  int max_shared_memory = cuda_get_max_shared_memory(0);
-  uint64_t full_sm_keybundle =
-      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
-          polynomial_size);
-  uint64_t full_sm_accumulate_step_one =
-      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
-          polynomial_size);
-  uint64_t full_sm_accumulate_step_two =
-      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
-          polynomial_size);
-  uint64_t partial_sm_accumulate_step_one =
-      get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
-          Torus>(polynomial_size);
-
-  if (max_shared_memory < full_sm_keybundle) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
-                                                          FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
-                                                          FULLSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  }
-
-  if (max_shared_memory < partial_sm_accumulate_step_one) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_accumulate_step_one<
-            Torus, params, NOSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_accumulate_step_one<
-            Torus, params, NOSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else if (max_shared_memory < full_sm_accumulate_step_one) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_accumulate_step_one<
-            Torus, params, PARTIALSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        partial_sm_accumulate_step_one));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_accumulate_step_one<
-            Torus, params, PARTIALSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_accumulate_step_one<
-            Torus, params, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        full_sm_accumulate_step_one));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_accumulate_step_one<
-            Torus, params, FULLSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  }
-
-  if (max_shared_memory < full_sm_accumulate_step_two) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_accumulate_step_two<
-            Torus, params, NOSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_accumulate_step_two<
-            Torus, params, NOSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_multi_bit_programmable_bootstrap_accumulate_step_two<
-            Torus, params, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        full_sm_accumulate_step_two));
-    cudaFuncSetCacheConfig(
-        device_multi_bit_programmable_bootstrap_accumulate_step_two<
-            Torus, params, FULLSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  }
+    pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, input_lwe_ciphertext_count, polynomial_size);
@@ -463,11 +403,13 @@ __host__ void execute_compute_keybundle(
    Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {

+  auto lwe_chunk_size = buffer->lwe_chunk_size;
  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;

  uint32_t keybundle_size_per_input =
      lwe_chunk_size * level_count * (glwe_dimension + 1) *
@@ -505,14 +447,12 @@ __host__ void execute_compute_keybundle(
 }

 template <typename Torus, class params>
-__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
-                               Torus *lut_vector, Torus *lut_vector_indexes,
-                               Torus *lwe_array_in, Torus *lwe_input_indexes,
-                               pbs_buffer<Torus, MULTI_BIT> *buffer,
-                               uint32_t num_samples, uint32_t lwe_dimension,
-                               uint32_t glwe_dimension,
-                               uint32_t polynomial_size, uint32_t base_log,
-                               uint32_t level_count, int j, int lwe_offset) {
+__host__ void execute_step_one(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
+    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) {

  uint64_t full_sm_accumulate_step_one =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -561,15 +501,14 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
 }

 template <typename Torus, class params>
-__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
-                               Torus *lwe_array_out, Torus *lwe_output_indexes,
-                               pbs_buffer<Torus, MULTI_BIT> *buffer,
-                               uint32_t num_samples, uint32_t lwe_dimension,
-                               uint32_t glwe_dimension,
-                               uint32_t polynomial_size,
-                               int32_t grouping_factor, uint32_t level_count,
-                               int j, int lwe_offset, uint32_t lwe_chunk_size) {
+__host__ void execute_step_two(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
+    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
+    uint32_t j, uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {

+  auto lwe_chunk_size = buffer->lwe_chunk_size;
  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
          polynomial_size);
@@ -591,7 +530,8 @@ __host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
            lwe_array_out, lwe_output_indexes, keybundle_fft,
            global_accumulator, global_accumulator_fft, lwe_dimension,
            glwe_dimension, polynomial_size, level_count, grouping_factor, j,
-            lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two);
+            lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two,
+            lut_count, lut_stride);
  else
    device_multi_bit_programmable_bootstrap_accumulate_step_two<Torus, params,
                                                                FULLSM>
@@ -599,7 +539,8 @@ __host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
           stream>>>(lwe_array_out, lwe_output_indexes, keybundle_fft,
                     global_accumulator, global_accumulator_fft, lwe_dimension,
                     glwe_dimension, polynomial_size, level_count,
-                     grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0);
+                     grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0,
+                     lut_count, lut_stride);
  check_cuda_error(cudaGetLastError());
 }

@@ -610,10 +551,10 @@ __host__ void host_multi_bit_programmable_bootstrap(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

-  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-      gpu_index, num_samples, polynomial_size);
+  auto lwe_chunk_size = buffer->lwe_chunk_size;

  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {
@@ -622,11 +563,11 @@ __host__ void host_multi_bit_programmable_bootstrap(
    execute_compute_keybundle<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
+        grouping_factor, level_count, lwe_offset);
    // Accumulate
    uint32_t chunk_size = std::min(
        lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-    for (int j = 0; j < chunk_size; j++) {
+    for (uint32_t j = 0; j < chunk_size; j++) {
      execute_step_one<Torus, params>(
          stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
          lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
@@ -635,7 +576,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
      execute_step_two<Torus, params>(
          stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
          num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-          grouping_factor, level_count, j, lwe_offset, lwe_chunk_size);
+          grouping_factor, level_count, j, lwe_offset, lut_count, lut_stride);
    }
  }
 }
--- a/Show More
+++ b/Show More