fix correct number of blocks per iteration

fix encryption each iteration
fix proper output
2026-04-28 03:01:21 -04:00 · 2024-09-30 16:08:05 +02:00 · 2024-09-26 08:30:00 +00:00 · 2024-09-26 07:52:16 +00:00 · 2024-09-25 18:12:41 +02:00 · 2024-09-24 09:22:02 +00:00
323 changed files with 17626 additions and 5396 deletions
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,6 +47,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -103,7 +104,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -26,6 +26,7 @@ jobs:
    outputs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
+      versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.core_crypto_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
@@ -56,7 +57,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -64,10 +65,15 @@ jobs:
              - tfhe/Cargo.toml
              - concrete-csprng/**
              - tfhe-zk-pok/**
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
            csprng:
              - concrete-csprng/**
            zk_pok:
              - tfhe-zk-pok/**
+            versionable:
+              - utils/tfhe-versionable/**
+              - utils/tfhe-versionable-derive/**
            core_crypto:
              - tfhe/src/core_crypto/**
            boolean:
@@ -103,6 +109,7 @@ jobs:
        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+          steps.changed-files.outputs.versionable_any_changed == 'true' ||
          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
          steps.changed-files.outputs.boolean_any_changed == 'true' ||
          steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -124,7 +131,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -147,6 +154,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -167,6 +175,11 @@ jobs:
        run: |
          make test_zk_pok

+      - name: Run tfhe-versionable tests
+        if: needs.should-run.outputs.versionable_test == 'true'
+        run: |
+          make test_versionable
+
      - name: Run core tests
        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
@@ -213,7 +226,7 @@ jobs:
          make test_safe_deserialization

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -228,7 +241,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -26,6 +26,11 @@ on:

 jobs:
  should-run:
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
@@ -41,7 +46,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -67,7 +72,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -88,6 +93,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: "false"
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -120,7 +126,7 @@ jobs:
          AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -135,7 +141,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -26,6 +26,11 @@ on:

 jobs:
  should-run:
+    if:
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
+      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
@@ -41,7 +46,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -67,7 +72,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -88,6 +93,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: "false"
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -124,7 +130,7 @@ jobs:
          AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -139,7 +145,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -63,7 +63,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -131,7 +131,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -154,6 +154,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -222,7 +223,7 @@ jobs:
          make test_kreyvium

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -237,7 +238,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -48,6 +48,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -71,11 +72,13 @@ jobs:
          make test_nodejs_wasm_api_in_docker

      - name: Run parallel wasm tests
+        # Tests time out at 60, so the step limit is set slightly higher to leave the step time to write its logs
+        timeout-minutes: 65
        run: |
          make test_web_js_api_parallel_ci

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -90,7 +93,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,6 +51,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -113,16 +114,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -140,7 +133,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,6 +47,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -101,16 +102,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -128,7 +121,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -42,6 +42,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -90,19 +91,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -184,7 +177,7 @@ jobs:
          ${{ secrets.SLAB_URL }}

      - name: Slack Notification
-        if: ${{ !success() && !cancelled() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -68,6 +68,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -143,16 +144,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
@@ -175,7 +168,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer.yml
+++ b/.github/workflows/benchmark_gpu_integer.yml
@@ -30,7 +30,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -71,6 +71,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -159,16 +160,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
@@ -191,7 +184,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_full.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
      max-parallel: 1
      matrix:
        command: [integer, integer_multi_bit]
-        op_flavor: [default, unchecked]
+        op_flavor: [default]
        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
@@ -75,6 +75,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -129,6 +130,12 @@ jobs:
        run: |
          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu

+      # Run these benchmarks only once: restrict them to the 'default' op_flavor of the 'integer' command
+      - name: Run compression benchmarks with AVX512
+        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
+        run: |
+          make bench_integer_compression_gpu
+
      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
@@ -152,22 +159,14 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-full-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-integer-full-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -184,7 +183,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer_multi_bit.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit.yml
@@ -42,7 +42,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,6 +84,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -182,23 +183,14 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-integer-multi-bit-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -215,7 +207,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_bit_multi_gpu.yml
@@ -42,7 +42,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -85,6 +85,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -172,22 +173,14 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -204,7 +197,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
+++ b/.github/workflows/benchmark_gpu_integer_multi_gpu_full.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -39,7 +39,7 @@ jobs:
          profile: multi-h100

  cuda-integer-full-multi-gpu-benchmarks:
-    name: Execute multi GPU integer benchmarks for all operations flavor
+    name: Execute multi GPU integer benchmarks
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 1440 # 24 hours
@@ -48,8 +48,8 @@ jobs:
      fail-fast: false
      max-parallel: 1
      matrix:
-        command: [integer, integer_multi_bit]
-        op_flavor: [default, unchecked]
+        command: [integer_multi_bit]
+        op_flavor: [default]
        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
@@ -75,6 +75,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -152,22 +153,14 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-integer-full-multi-gpu-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -184,7 +177,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_cpu_benchmark.yml
+++ b/.github/workflows/integer_cpu_benchmark.yml
@@ -62,7 +62,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,6 +90,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -125,6 +126,12 @@ jobs:
        run: |
          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}

+      # Run these benchmarks only once
+      - name: Run compression benchmarks with AVX512
+        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
+        run: |
+          make bench_integer_compression
+
      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
@@ -147,16 +154,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -174,7 +173,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/shortint_cpu_benchmark.yml
+++ b/.github/workflows/shortint_cpu_benchmark.yml
@@ -56,7 +56,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -82,6 +82,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -149,16 +150,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -176,7 +169,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/signed_integer_cpu_benchmark.yml
+++ b/.github/workflows/signed_integer_cpu_benchmark.yml
@@ -62,7 +62,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,6 +90,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -147,16 +148,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -174,7 +167,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -39,7 +39,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -64,7 +64,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -83,6 +83,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -103,6 +104,8 @@ jobs:
          toolchain: nightly

      - name: Run benchmarks
+        # Test timeouts are set to 60, so give the step a little extra time to write out its logs.
+        timeout-minutes: 65
        run: |
          make install_node
          make bench_web_js_api_parallel_ci
@@ -145,16 +148,8 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() }}
@@ -172,7 +167,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -36,7 +36,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,6 +87,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -153,19 +154,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ !success() && !cancelled() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -180,7 +173,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -25,3 +25,9 @@ jobs:
      - name: Lint workflows
        run: |
          make lint_workflow
+
+      - name: Ensure SHA pinned actions
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@0901cf7b71c7ea6261ec69a3dc2bd3f9264f893e # v3.0.12
+        with:
+          allowlist: |
+            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -57,7 +57,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          files_yaml: |
            tfhe:
@@ -125,7 +125,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -48,6 +48,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -78,7 +79,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -37,6 +37,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -74,7 +75,7 @@ jobs:
          github_token: ${{ secrets.GITHUB_TOKEN }}

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -34,7 +34,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,6 +111,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -146,7 +147,8 @@ jobs:

      - name: Run core crypto and internal CUDA backend tests
        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=FALSE make test_core_crypto_gpu
+          BIG_TESTS_INSTANCE=FALSE make test_integer_compression_gpu
          BIG_TESTS_INSTANCE=TRUE make test_cuda_backend

      - name: Run user docs tests
@@ -165,7 +167,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -182,7 +184,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -33,7 +33,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -63,7 +63,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -109,6 +109,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -145,6 +146,7 @@ jobs:
      - name: Run core crypto and internal CUDA backend tests
        run: |
          make test_core_crypto_gpu
+          make test_integer_compression_gpu
          make test_cuda_backend

      - name: Run user docs tests
@@ -163,7 +165,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -180,7 +182,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -0,0 +1,161 @@
+# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
+name: TFHE Cuda Backend - Full tests on H100
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+
+# Only the workflow_dispatch trigger is declared: no push, pull_request or schedule events.
+on:
+  workflow_dispatch:
+
+jobs:
+  # Starts a single-h100 hyperstack instance and exposes its runner label to the other jobs.
+  setup-instance:
+    name: Setup instance (cuda-h100-tests)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-tests-linux:
+    name: CUDA H100 tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run core crypto, integer and internal CUDA backend tests
+        run: |
+          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+  # Sends a Slack message only when one of the needed jobs has failed.
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ failure() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  # Must also run when the tests fail or are cancelled, otherwise the instance is never stopped.
+  teardown-instance:
+    name: Teardown instance (cuda-h100-tests)
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -34,7 +34,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,6 +111,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -144,6 +145,10 @@ jobs:
        if: ${{ !cancelled() }}
        run: nvidia-smi

+      - name: Run multi-bit CUDA integer compression tests
+        run: |
+          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
+
      # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
      - name: Run multi-bit CUDA integer tests
        run: |
@@ -165,7 +170,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -182,7 +187,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -24,7 +24,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -56,6 +56,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -94,7 +95,7 @@ jobs:
          make pcc_gpu

      - name: Slack Notification
-        if: ${{ always() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
@@ -109,7 +110,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -31,10 +31,11 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +66,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -154,7 +155,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -171,7 +172,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -42,7 +42,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -73,7 +73,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -119,6 +119,7 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          persist-credentials: 'false'
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -168,7 +169,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-signed-integer-tests ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -185,7 +186,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -31,10 +31,11 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +66,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -154,7 +155,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -171,7 +172,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -38,10 +38,11 @@ jobs:
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+        uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -72,7 +73,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -165,7 +166,7 @@ jobs:
    name: Slack Notification
    needs: [ setup-instance, cuda-unsigned-integer-tests ]
    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }}
+    if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
@@ -182,7 +183,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -29,14 +29,14 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: aws
-          profile: gpu-test
+          profile: gpu-build

  publish-cuda-release:
    name: Publish CUDA Release
@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+        uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/Makefile
+++ b/Makefile
@@ -284,6 +284,9 @@ clippy_c_api: install_rs_check_toolchain

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
 clippy_js_wasm_api: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
@@ -470,17 +473,24 @@ test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu:: --test-threads=1
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu:: --test-threads=1

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
 test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=1
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

+.PHONY: test_integer_compression_gpu # Run the tests for integer compression on the gpu backend
+test_integer_compression_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests:: --test-threads=1
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress --test-threads=1
+
 .PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
 test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -734,7 +744,7 @@ test_zk_pok: install_rs_build_toolchain
 .PHONY: test_versionable # Run tests for tfhe-versionable subcrate
 test_versionable: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		-p tfhe-versionable
+		--all-targets -p tfhe-versionable

 # The backward compat data repo holds historical binary data but also rust code to generate and load them.
 # Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
@@ -883,6 +893,18 @@ bench_integer_gpu: install_rs_check_toolchain
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

+.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
+bench_integer_compression: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench	glwe_packing_compression-integer-bench \
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+
+.PHONY: bench_integer_compression_gpu # Run benchmarks for unsigned integer compression on the gpu backend
+bench_integer_compression_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench	glwe_packing_compression-integer-bench \
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
+
 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
--- a/apps/trivium/src/lib.rs
+++ b/apps/trivium/src/lib.rs
@@ -1,3 +1,5 @@
+#![allow(clippy::too_long_first_doc_paragraph)]
+
 mod static_deque;

 mod kreyvium;
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.4.0-alpha.0"
+version = "0.4.0-alpha.1"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -67,9 +67,21 @@ endif()

 add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

+# Select the optimization flags from CMAKE_BUILD_TYPE and append them to OPTIMIZATION_FLAGS
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+  # Debug build: define the DEBUG macro, disable optimizations (-O0) and pass the -G and -g debug flags
+  message("Compiling in Debug mode")
+  add_definitions(-DDEBUG)
+  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
+else()
+  # Any other build type (including an unset CMAKE_BUILD_TYPE) is compiled as release with -O3
+  message("Compiling in Release mode")
+  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
+endif()
+
 # in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
 set(CMAKE_CUDA_FLAGS
-    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
+    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
  -std=c++17 --no-exceptions  --expt-relaxed-constexpr -rdc=true \
  --use_fast_math -Xcompiler -fPIC")

--- a/backends/tfhe-cuda-backend/cuda/include/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/compression.h
@@ -18,7 +18,8 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
+    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
+    bool allocate_gpu_memory);

 void cuda_integer_compress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -94,6 +95,7 @@ template <typename Torus> struct int_decompression {

  uint32_t storage_log_modulus;

+  uint32_t num_lwes;
  uint32_t body_count;

  Torus *tmp_extracted_glwe;
@@ -104,12 +106,13 @@ template <typename Torus> struct int_decompression {
  int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
                    uint32_t gpu_count, int_radix_params encryption_params,
                    int_radix_params compression_params,
-                    uint32_t num_radix_blocks, uint32_t storage_log_modulus,
-                    bool allocate_gpu_memory) {
+                    uint32_t num_radix_blocks, uint32_t body_count,
+                    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
    this->encryption_params = encryption_params;
    this->compression_params = compression_params;
    this->storage_log_modulus = storage_log_modulus;
-    this->body_count = num_radix_blocks;
+    this->num_lwes = num_radix_blocks;
+    this->body_count = body_count;

    if (allocate_gpu_memory) {
      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -39,10 +39,6 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

-bool cuda_check_support_cooperative_groups();
-
-bool cuda_check_support_thread_block_clusters();
-
 void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

@@ -62,9 +58,13 @@ void cuda_synchronize_device(uint32_t gpu_index);
 void cuda_drop(void *ptr, uint32_t gpu_index);

 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+}

 int cuda_get_max_shared_memory(uint32_t gpu_index);
-}
+
+bool cuda_check_support_cooperative_groups();
+
+bool cuda_check_support_thread_block_clusters();

 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -8,7 +8,7 @@ extern std::mutex m;
 extern bool p2p_enabled;

 extern "C" {
-int cuda_setup_multi_gpu();
+int32_t cuda_setup_multi_gpu();
 }

 // Define a variant type that can be either a vector or a single pointer
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -80,6 +80,11 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
                                            uint32_t gpu_count,
                                            int8_t **mem_ptr_void);

+void cuda_apply_many_univariate_lut_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
+    void **bsks, uint32_t num_blocks, uint32_t num_luts, uint32_t lut_stride);
+
 void scratch_cuda_full_propagation_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -112,10 +117,11 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
 void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
                               uint32_t gpu_count, int8_t **mem_ptr_void);

-void cuda_negate_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus);
+void cuda_negate_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);

 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
@@ -385,8 +391,8 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
-    void **bsks, uint32_t num_blocks, uint32_t shift);
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift);

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -970,28 +976,52 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
                                                (params.big_lwe_dimension + 1) *
                                                sizeof(Torus),
                                            streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_bits, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);
      tmp_shift_bits = (Torus *)cuda_malloc_async(
          max_num_bits_that_tell_shift * num_radix_blocks *
              (params.big_lwe_dimension + 1) * sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_shift_bits, 0,
+                        max_num_bits_that_tell_shift * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);

      tmp_rotated = (Torus *)cuda_malloc_async(
          bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_rotated, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);

      tmp_input_bits_a = (Torus *)cuda_malloc_async(
          bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_input_bits_a, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);
      tmp_input_bits_b = (Torus *)cuda_malloc_async(
          bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_input_bits_b, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);
      tmp_mux_inputs = (Torus *)cuda_malloc_async(
          bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
              sizeof(Torus),
          streams[0], gpu_indexes[0]);
+      cuda_memset_async(tmp_mux_inputs, 0,
+                        bits_per_block * num_radix_blocks *
+                            (params.big_lwe_dimension + 1) * sizeof(Torus),
+                        streams[0], gpu_indexes[0]);

      auto mux_lut_f = [](Torus x) -> Torus {
        // x is expected to be x = 0bcba
@@ -1151,6 +1181,11 @@ template <typename Torus> struct int_sc_prop_memory {
        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
    step_output = (Torus *)cuda_malloc_async(
        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+    cuda_memset_async(generates_or_propagates, 0,
+                      num_radix_blocks * big_lwe_size_bytes, streams[0],
+                      gpu_indexes[0]);
+    cuda_memset_async(step_output, 0, num_radix_blocks * big_lwe_size_bytes,
+                      streams[0], gpu_indexes[0]);

    // declare functions for lut generation
    auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
@@ -1267,6 +1302,11 @@ template <typename Torus> struct int_overflowing_sub_memory {
        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
    step_output = (Torus *)cuda_malloc_async(
        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+    cuda_memset_async(generates_or_propagates, 0,
+                      num_radix_blocks * big_lwe_size_bytes, streams[0],
+                      gpu_indexes[0]);
+    cuda_memset_async(step_output, 0, num_radix_blocks * big_lwe_size_bytes,
+                      streams[0], gpu_indexes[0]);

    // declare functions for lut generation
    auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
@@ -1356,6 +1396,7 @@ template <typename Torus> struct int_overflowing_sub_memory {

 template <typename Torus> struct int_sum_ciphertexts_vec_memory {
  Torus *new_blocks;
+  Torus *new_blocks_copy;
  Torus *old_blocks;
  Torus *small_lwe_vector;
  int_radix_params params;
@@ -1383,17 +1424,40 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
    new_blocks = (Torus *)cuda_malloc_async(
        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
        streams[0], gpu_indexes[0]);
+    new_blocks_copy = (Torus *)cuda_malloc_async(
+        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
+        streams[0], gpu_indexes[0]);
    old_blocks = (Torus *)cuda_malloc_async(
        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
        streams[0], gpu_indexes[0]);
    small_lwe_vector = (Torus *)cuda_malloc_async(
        max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus),
        streams[0], gpu_indexes[0]);
+    cuda_memset_async(new_blocks, 0,
+                      max_pbs_count * (params.big_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(new_blocks_copy, 0,
+                      max_pbs_count * (params.big_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(old_blocks, 0,
+                      max_pbs_count * (params.big_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(small_lwe_vector, 0,
+                      max_pbs_count * (params.small_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);

    d_smart_copy_in = (int32_t *)cuda_malloc_async(
        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
    d_smart_copy_out = (int32_t *)cuda_malloc_async(
        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
+    cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t),
+                      streams[0], gpu_indexes[0]);
  }

  int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -1414,11 +1478,22 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
    this->new_blocks = new_blocks;
    this->old_blocks = old_blocks;
    this->small_lwe_vector = small_lwe_vector;
+    new_blocks_copy = (Torus *)cuda_malloc_async(
+        max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
+        streams[0], gpu_indexes[0]);
+    cuda_memset_async(new_blocks_copy, 0,
+                      max_pbs_count * (params.big_lwe_dimension + 1) *
+                          sizeof(Torus),
+                      streams[0], gpu_indexes[0]);

    d_smart_copy_in = (int32_t *)cuda_malloc_async(
        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
    d_smart_copy_out = (int32_t *)cuda_malloc_async(
        max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
+    cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t),
+                      streams[0], gpu_indexes[0]);
+    cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t),
+                      streams[0], gpu_indexes[0]);
  }

  void release(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -1432,8 +1507,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
      cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
    }

+    cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]);
    scp_mem->release(streams, gpu_indexes, gpu_count);
-
    delete scp_mem;
  }
 };
@@ -2087,7 +2162,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {

    if (allocate_gpu_memory) {
      Torus total_modulus = params.message_modulus * params.carry_modulus;
-      uint32_t max_value = total_modulus - 1;
+      uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);

      int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
      tmp_block_accumulated = (Torus *)cuda_malloc_async(
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -69,7 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
+    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

 void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -77,18 +77,10 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
+    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);

 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer);
-
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count);
-
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count);
 }

 template <typename Torus>
@@ -339,7 +331,8 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template <typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
@@ -348,7 +341,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 #if (CUDA_ARCH >= 900)
 template <typename Torus>
@@ -358,7 +352,8 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_tbc(
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -27,7 +27,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
@@ -58,7 +59,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);
 #endif

 template <typename Torus>
@@ -74,7 +76,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);

 template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
@@ -90,7 +93,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);

 template <typename Torus>
 uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
@@ -27,7 +27,7 @@ private:

 public:
  __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
-                          uint32_t num_poly)
+                          uint32_t num_poly = 1)
      : base_log(base_log), level_count(level_count), num_poly(num_poly),
        state(state) {

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector<uint32_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector(
+  host_keyswitch_lwe_ciphertext_vector<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
@@ -66,7 +66,7 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {

-  host_packing_keyswitch_lwe_list_to_glwe(
+  host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(glwe_array_out),
      static_cast<uint64_t *>(lwe_array_in),
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -280,6 +280,12 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
    uint32_t lwe_dimension_in, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes) {
+
+  // Reject inputs with more LWEs than polynomial_size; exactly
+  // polynomial_size LWEs is still accepted by this check.
+  if (num_lwes > polynomial_size)
+    PANIC("Cuda error: too many LWEs to pack. The number of LWEs should be "
+          "smaller than or equal to "
+          "polynomial_size.")
+
  cudaSetDevice(gpu_index);
  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -177,8 +177,8 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
    int num_blocks = (n + block_size - 1) / block_size;

    // Launch the kernel
-    cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
-                                                                 n);
+    cuda_set_value_kernel<Torus>
+        <<<num_blocks, block_size, 0, stream>>>(d_array, value, n);
    check_cuda_error(cudaGetLastError());
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -37,12 +37,12 @@ void host_resolve_signed_overflow(
      streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
      mem->params.big_lwe_dimension, 1);

-  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                last_block_inner_propagation, x, mem->params.big_lwe_dimension,
-                1);
-  host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
-                last_block_inner_propagation, last_block_input_carry,
-                mem->params.big_lwe_dimension, 1);
+  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                       last_block_inner_propagation, x,
+                       mem->params.big_lwe_dimension, 1);
+  host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
+                       last_block_inner_propagation, last_block_input_carry,
+                       mem->params.big_lwe_dimension, 1);

  host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
                                      last_block_inner_propagation,
@@ -94,14 +94,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(

  // phase 1
  if (op == SIGNED_OPERATION::ADDITION) {
-    host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
-                  big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
+                         big_lwe_dimension, num_blocks);
  } else {
-    host_integer_radix_negation(
+    host_integer_radix_negation<Torus>(
        streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
-    host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
-                  big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
+                         big_lwe_dimension, num_blocks);
  }

  // phase 2
@@ -109,10 +109,10 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }

-  host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
-                              result, output_carry, input_carries,
-                              mem_ptr->scp_mem, bsks, ksks, num_blocks);
-  host_generate_last_block_inner_propagation(
+  host_propagate_single_carry<Torus>(
+      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
+      input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
+  host_generate_last_block_inner_propagation<Torus>(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
      last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
      &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
@@ -126,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
  // phase 3
  auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];

-  host_resolve_signed_overflow(
+  host_resolve_signed_overflow<Torus>(
      streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
      input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);

--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -17,7 +17,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
  std::function<uint64_t(uint64_t)> predicate_lut_f =
      [](uint64_t x) -> uint64_t { return x == 1; };

-  scratch_cuda_integer_radix_cmux_kb(
+  scratch_cuda_integer_radix_cmux_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -27,10 +27,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
    auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
    auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;

-    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
-        lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
-        lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
-        params.message_modulus, 1);
+    device_pack_bivariate_blocks<Torus>
+        <<<num_blocks, num_threads, 0, streams[0]>>>(
+            lwe_array_out_block, predicate->lwe_indexes_in,
+            lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
+            params.big_lwe_dimension, params.message_modulus, 1);
    check_cuda_error(cudaGetLastError());
  }

@@ -57,13 +58,15 @@ __host__ void host_integer_radix_cmux_kb(
  }

  auto mem_true = mem_ptr->zero_if_true_buffer;
-  zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
-              lwe_array_true, lwe_condition, mem_true,
-              mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
+  zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+                     lwe_array_true, lwe_condition, mem_true,
+                     mem_ptr->inverted_predicate_lut, bsks, ksks,
+                     num_radix_blocks);
  auto mem_false = mem_ptr->zero_if_false_buffer;
-  zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
-              lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
-              bsks, ksks, num_radix_blocks);
+  zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
+                     mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
+                     mem_false, mem_ptr->predicate_lut, bsks, ksks,
+                     num_radix_blocks);
  for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
    cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
  }
@@ -75,9 +78,9 @@ __host__ void host_integer_radix_cmux_kb(
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
  // have kept its value
  auto added_cts = mem_ptr->tmp_true_ct;
-  host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
-                mem_ptr->tmp_false_ct, params.big_lwe_dimension,
-                num_radix_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
+                       mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
+                       params.big_lwe_dimension, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -43,7 +43,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  // Add all blocks and store in sum
-  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
+  device_accumulate_all_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
      output, input, lwe_dimension, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
 }
@@ -62,7 +62,6 @@ __host__ void are_all_comparisons_block_true(
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -75,7 +74,7 @@ __host__ void are_all_comparisons_block_true(
  auto tmp_out = are_all_block_true_buffer->tmp_out;

  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t max_value = total_modulus - 1;
+  uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

  cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
                               num_radix_blocks * (big_lwe_dimension + 1) *
@@ -96,8 +95,9 @@ __host__ void are_all_comparisons_block_true(
    auto is_equal_to_num_blocks_map =
        &are_all_block_true_buffer->is_equal_to_lut_map;
    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
-                            input_blocks, big_lwe_dimension, chunk_length);
+      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
+                                   input_blocks, big_lwe_dimension,
+                                   chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -121,9 +121,8 @@ __host__ void are_all_comparisons_block_true(
            new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                     max_value, num_radix_blocks, true);

-        auto is_equal_to_num_blocks_lut_f = [max_value,
-                                             chunk_length](Torus x) -> Torus {
-          return (x & max_value) == chunk_length;
+        auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
+          return x == chunk_length;
        };
        generate_device_accumulator<Torus>(
            streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
@@ -165,7 +164,6 @@ __host__ void is_at_least_one_comparisons_block_true(
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -174,7 +172,7 @@ __host__ void is_at_least_one_comparisons_block_true(
  auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;

  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t max_value = total_modulus - 1;
+  uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);

  cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
                               num_radix_blocks * (big_lwe_dimension + 1) *
@@ -192,8 +190,9 @@ __host__ void is_at_least_one_comparisons_block_true(
    auto input_blocks = mem_ptr->tmp_lwe_array_out;
    auto accumulator = buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
-                            input_blocks, big_lwe_dimension, chunk_length);
+      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
+                                   input_blocks, big_lwe_dimension,
+                                   chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -280,8 +279,8 @@ __host__ void host_compare_with_zero_equality(
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

-      accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
-                            big_lwe_dimension, chunk_size);
+      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
+                                   big_lwe_dimension, chunk_size);

      num_sum_blocks++;
      remainder_blocks -= (chunk_size - 1);
@@ -295,8 +294,9 @@ __host__ void host_compare_with_zero_equality(
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
      zero_comparison);
-  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                 sum, mem_ptr, bsks, ksks, num_sum_blocks);
+  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
+                                        lwe_array_out, sum, mem_ptr, bsks, ksks,
+                                        num_sum_blocks);
 }

 template <typename Torus>
@@ -310,7 +310,7 @@ __host__ void host_integer_radix_equality_check_kb(

  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
-  integer_radix_apply_bivariate_lookup_table_kb(
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
      bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
      eq_buffer->operator_lut->params.message_modulus);
@@ -319,9 +319,9 @@ __host__ void host_integer_radix_equality_check_kb(
  //
  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
-  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                 comparisons, mem_ptr, bsks, ksks,
-                                 num_radix_blocks);
+  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
+                                        lwe_array_out, comparisons, mem_ptr,
+                                        bsks, ksks, num_radix_blocks);
 }

 template <typename Torus>
@@ -352,19 +352,20 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

  // Subtract
  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
-                   lwe_array_right, big_lwe_dimension, num_radix_blocks);
+  host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array_out,
+                          lwe_array_left, lwe_array_right, big_lwe_dimension,
+                          num_radix_blocks);

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
      num_radix_blocks, is_non_zero_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(
+  host_integer_radix_add_scalar_one_inplace<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
      num_radix_blocks, message_modulus, carry_modulus);
 }
@@ -406,8 +407,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,

  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
-    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
-                partial_block_count, 4);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                       partial_block_count, 4);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -433,8 +434,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
  std::function<Torus(Torus)> f;

  if (partial_block_count == 2) {
-    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
-                partial_block_count, 4);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                       partial_block_count, 4);

    f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
      int msb = (x >> 2) & 3;
@@ -454,9 +455,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
  last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

  // Last leaf
-  integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
-                                                 gpu_count, lwe_array_out, y,
-                                                 bsks, ksks, 1, last_lut);
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
+      last_lut);
 }

 template <typename Torus>
@@ -488,19 +489,21 @@ __host__ void host_integer_radix_difference_check_kb(
    if (mem_ptr->is_signed) {
      packed_num_radix_blocks -= 2;
    }
-    pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
-                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
-    pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
-                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
+                       big_lwe_dimension, packed_num_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_right,
+                       lwe_array_right, big_lwe_dimension,
+                       packed_num_radix_blocks, message_modulus);
    // From this point we have half number of blocks
    packed_num_radix_blocks /= 2;

    // Clean noise
    auto identity_lut = mem_ptr->identity_lut;
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
        packed_num_radix_blocks, identity_lut);
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
        packed_num_radix_blocks, identity_lut);

@@ -517,16 +520,17 @@ __host__ void host_integer_radix_difference_check_kb(
  if (!mem_ptr->is_signed) {
    // Compare packed blocks, or simply the total number of radix blocks in the
    // inputs
-    compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
-                            rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
+    compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count, comparisons,
+                                   lhs, rhs, mem_ptr, bsks, ksks,
+                                   packed_num_radix_blocks);
    num_comparisons = packed_num_radix_blocks;
  } else {
    // Packing is possible
    if (carry_modulus >= message_modulus) {
      // Compare (num_radix_blocks - 2) / 2 packed blocks
-      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
-                              rhs, mem_ptr, bsks, ksks,
-                              packed_num_radix_blocks);
+      compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                     comparisons, lhs, rhs, mem_ptr, bsks, ksks,
+                                     packed_num_radix_blocks);

      // Compare the last block before the sign block separately
      auto identity_lut = mem_ptr->identity_lut;
@@ -535,21 +539,21 @@ __host__ void host_integer_radix_difference_check_kb(
      Torus *last_right_block_before_sign_block =
          diff_buffer->tmp_packed_right +
          packed_num_radix_blocks * big_lwe_size;
-      integer_radix_apply_univariate_lookup_table_kb(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
          identity_lut);
-      integer_radix_apply_univariate_lookup_table_kb(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
          1, identity_lut);
-      compare_radix_blocks_kb(
+      compare_radix_blocks_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + packed_num_radix_blocks * big_lwe_size,
          last_left_block_before_sign_block, last_right_block_before_sign_block,
          mem_ptr, bsks, ksks, 1);
      // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -558,11 +562,11 @@ __host__ void host_integer_radix_difference_check_kb(
      num_comparisons = packed_num_radix_blocks + 2;

    } else {
-      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
-                              lwe_array_left, lwe_array_right, mem_ptr, bsks,
-                              ksks, num_radix_blocks - 1);
+      compare_radix_blocks_kb<Torus>(
+          streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
+          lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
      // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -575,9 +579,9 @@ __host__ void host_integer_radix_difference_check_kb(
  // Reduces a vec containing radix blocks that encrypts a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
-  tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
-                      comparisons, mem_ptr->diff_buffer->tree_buffer,
-                      reduction_lut_f, bsks, ksks, num_comparisons);
+  tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                             comparisons, mem_ptr->diff_buffer->tree_buffer,
+                             reduction_lut_f, bsks, ksks, num_comparisons);
 }

 template <typename Torus>
@@ -601,16 +605,16 @@ host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                             Torus **ksks, uint32_t total_num_radix_blocks) {

  // Compute the sign
-  host_integer_radix_difference_check_kb(
+  host_integer_radix_difference_check_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
      ksks, total_num_radix_blocks);

  // Selector
-  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
-                             total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out,
+      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
+      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -14,7 +14,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
      lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
      carry_modulus);

-  scratch_cuda_compress_integer_radix_ciphertext_64(
+  scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_compression<uint64_t> **)mem_ptr, num_lwes, compression_params,
      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
@@ -25,24 +25,25 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
    uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
    uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+    PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
+    bool allocate_gpu_memory) {

+  // Decompression doesn't keyswitch, so big and small dimensions are the same
  int_radix_params encryption_params(
      pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
-      (encryption_glwe_dimension + 1) * encryption_polynomial_size,
-      lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
-      carry_modulus);
+      lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
+      message_modulus, carry_modulus);

  int_radix_params compression_params(
      pbs_type, compression_glwe_dimension, compression_polynomial_size,
-      (compression_glwe_dimension + 1) * compression_polynomial_size,
-      lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
-      carry_modulus);
+      lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
+      0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);

-  scratch_cuda_integer_decompress_radix_ciphertext_64(
+  scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
-      compression_params, storage_log_modulus, allocate_gpu_memory);
+      (int_decompression<uint64_t> **)mem_ptr, num_lwes, body_count,
+      encryption_params, compression_params, storage_log_modulus,
+      allocate_gpu_memory);
 }
 void cuda_integer_compress_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -37,15 +37,14 @@ __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,

 template <typename Torus>
 __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
-                        Torus *array_out, Torus *array_in, uint32_t num_inputs,
-                        uint32_t body_count, int_compression<Torus> *mem_ptr) {
+                        Torus *array_out, Torus *array_in, uint32_t body_count,
+                        int_compression<Torus> *mem_ptr) {
  cudaSetDevice(gpu_index);
  auto params = mem_ptr->compression_params;

  auto log_modulus = mem_ptr->storage_log_modulus;
  auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
  auto number_bits_to_pack = in_len * log_modulus;
-
  auto nbits = sizeof(Torus) * 8;
  // number_bits_to_pack.div_ceil(Scalar::BITS)
  auto len = (number_bits_to_pack + nbits - 1) / nbits;
@@ -55,8 +54,8 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,

  dim3 grid(num_blocks);
  dim3 threads(num_threads);
-  pack<<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus, in_len,
-                                     len);
+  pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
+                                            in_len, len);
 }

 template <typename Torus>
@@ -71,15 +70,16 @@ __host__ void host_integer_compress(cudaStream_t *streams,

  // Shift
  auto lwe_shifted = mem_ptr->tmp_lwe;
-  host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted,
-                                lwe_array_in,
-                                (uint64_t)compression_params.message_modulus,
-                                input_lwe_dimension, num_lwes);
+  host_cleartext_multiplication<Torus>(
+      streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
+      (uint64_t)compression_params.message_modulus, input_lwe_dimension,
+      num_lwes);

  uint32_t lwe_in_size = input_lwe_dimension + 1;
  uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
                           compression_params.polynomial_size;
  uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
+  auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

  // Keyswitch LWEs to GLWE
  auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -88,26 +88,24 @@ __host__ void host_integer_compress(cudaStream_t *streams,
    auto lwe_subset = lwe_shifted + i * lwe_in_size;
    auto glwe_out = tmp_glwe_array_out + i * glwe_out_size;

-    host_packing_keyswitch_lwe_list_to_glwe(
+    host_packing_keyswitch_lwe_list_to_glwe<Torus>(
        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
        fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
        compression_params.polynomial_size, compression_params.ks_base_log,
-        compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
+        compression_params.ks_level, body_count);
  }

-  auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);
-
  // Modulus switch
-  host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
-                              num_glwes *
-                                  (compression_params.glwe_dimension *
-                                       compression_params.polynomial_size +
-                                   body_count),
-                              mem_ptr->storage_log_modulus);
+  host_modulus_switch_inplace<Torus>(
+      streams[0], gpu_indexes[0], tmp_glwe_array_out,
+      num_glwes * (compression_params.glwe_dimension *
+                       compression_params.polynomial_size +
+                   body_count),
+      mem_ptr->storage_log_modulus);
  check_cuda_error(cudaGetLastError());

-  host_pack(streams[0], gpu_indexes[0], glwe_array_out, tmp_glwe_array_out,
-            num_glwes, body_count, mem_ptr);
+  host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
+                   tmp_glwe_array_out, body_count, mem_ptr);
 }

 template <typename Torus>
@@ -156,21 +154,21 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
  auto log_modulus = mem_ptr->storage_log_modulus;

  uint32_t body_count = mem_ptr->body_count;
+
  auto initial_out_len =
-      params.glwe_dimension * params.polynomial_size + body_count * body_count;
+      params.glwe_dimension * params.polynomial_size + body_count;

  // We assure the tail of the glwe is zeroed
-  auto zeroed_slice =
-      glwe_array_out + params.glwe_dimension * params.polynomial_size;
-  cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus),
+  auto zeroed_slice = glwe_array_out + initial_out_len;
+  cuda_memset_async(zeroed_slice, 0,
+                    (params.polynomial_size - body_count) * sizeof(Torus),
                    stream, gpu_index);
-
  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
  dim3 grid(num_blocks);
  dim3 threads(num_threads);
-  extract<<<grid, threads, 0, stream>>>(glwe_array_out, array_in, glwe_index,
-                                        log_modulus, initial_out_len);
+  extract<Torus><<<grid, threads, 0, stream>>>(
+      glwe_array_out, array_in, glwe_index, log_modulus, initial_out_len);
  check_cuda_error(cudaGetLastError());
 }

@@ -182,12 +180,18 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t indexes_array_size, void **bsks,
                        int_decompression<Torus> *mem_ptr) {

+  auto polynomial_size = mem_ptr->encryption_params.polynomial_size;
+  if (indexes_array_size > polynomial_size)
+    PANIC("Cuda error: too many LWEs to decompress. The number of LWEs should "
+          "be smaller than "
+          "polynomial_size.")
+
  auto extracted_glwe = mem_ptr->tmp_extracted_glwe;
  auto compression_params = mem_ptr->compression_params;
-  host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
-               mem_ptr);
+  host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
+                      packed_glwe_in, 0, mem_ptr);

-  auto num_lwes = mem_ptr->body_count;
+  auto num_lwes = mem_ptr->num_lwes;

  // Sample extract
  auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
@@ -196,24 +200,69 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
                              compression_params.glwe_dimension,
                              compression_params.polynomial_size);

+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
  /// dimension to a big LWE dimension
  auto encryption_params = mem_ptr->encryption_params;
-  auto carry_extract_lut = mem_ptr->carry_extract_lut;
-  execute_pbs_async<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out,
-      carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
-      carry_extract_lut->lut_indexes_vec, extracted_lwe,
-      carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
-      encryption_params.glwe_dimension,
-      compression_params.glwe_dimension * compression_params.polynomial_size,
-      encryption_params.polynomial_size, encryption_params.pbs_base_log,
-      encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
-      encryption_params.pbs_type);
+  auto lut = mem_ptr->carry_extract_lut;
+  auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
+  if (active_gpu_count == 1) {
+
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_out,
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
+        lut->lwe_indexes_in, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type, lut_count, lut_stride);
+  } else {
+    /// For multi GPU execution we create vectors of pointers for inputs and
+    /// outputs
+    std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+    std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
+        compression_params.small_lwe_dimension + 1);
+
+    /// Apply PBS
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        encryption_params.glwe_dimension,
+        compression_params.small_lwe_dimension,
+        encryption_params.polynomial_size, encryption_params.pbs_base_log,
+        encryption_params.pbs_level, encryption_params.grouping_factor,
+        num_lwes, encryption_params.pbs_type, lut_count, lut_stride);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      lwe_array_out, lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes, num_lwes,
+                                      encryption_params.big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+  }
 }

 template <typename Torus>
-__host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
+__host__ void scratch_cuda_compress_integer_radix_ciphertext(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_compression<Torus> **mem_ptr, uint32_t num_lwes,
    int_radix_params compression_params, uint32_t lwe_per_glwe,
@@ -225,14 +274,14 @@ __host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
 }

 template <typename Torus>
-__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
+__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
+    int_decompression<Torus> **mem_ptr, uint32_t num_lwes, uint32_t body_count,
    int_radix_params encryption_params, int_radix_params compression_params,
    uint32_t storage_log_modulus, bool allocate_gpu_memory) {

  *mem_ptr = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
-      num_lwes, storage_log_modulus, allocate_gpu_memory);
+      num_lwes, body_count, storage_log_modulus, allocate_gpu_memory);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -282,7 +282,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
          // Shift the mask so that we will only keep bits we should
          uint32_t shifted_mask = full_message_mask >> shift_amount;

-          integer_radix_apply_univariate_lookup_table_kb(
+          integer_radix_apply_univariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
              interesting_divisor.last_block(), bsks, ksks, 1,
              mem_ptr->masking_luts_1[shifted_mask]);
@@ -310,7 +310,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
          // the estimated degree of the output is < msg_modulus
          shifted_mask = shifted_mask & full_message_mask;

-          integer_radix_apply_univariate_lookup_table_kb(
+          integer_radix_apply_univariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
              divisor_ms_blocks.first_block(), bsks, ksks, 1,
              mem_ptr->masking_luts_2[shifted_mask]);
@@ -334,7 +334,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
          interesting_remainder1.insert(0, numerator_block_1.first_block(),
                                        streams[0], gpu_indexes[0]);

-          host_integer_radix_logical_scalar_shift_kb_inplace(
+          host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
              streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
              mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);

@@ -342,7 +342,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                               interesting_remainder1.len - 1, streams[0],
                               gpu_indexes[0]);

-          host_radix_blocks_rotate_left(
+          host_radix_blocks_rotate_left<Torus>(
              streams, gpu_indexes, gpu_count, interesting_remainder1.data,
              tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);

@@ -363,7 +363,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

    auto left_shift_interesting_remainder2 =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
-          host_integer_radix_logical_scalar_shift_kb_inplace(
+          host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
              streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
              mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
        }; // left_shift_interesting_remainder2
@@ -396,10 +396,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
    // but in that position, interesting_remainder2 always has a 0
    auto &merged_interesting_remainder = interesting_remainder1;

-    host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
-                  merged_interesting_remainder.data,
-                  interesting_remainder2.data, radix_params.big_lwe_dimension,
-                  merged_interesting_remainder.len);
+    host_addition<Torus>(
+        streams[0], gpu_indexes[0], merged_interesting_remainder.data,
+        merged_interesting_remainder.data, interesting_remainder2.data,
+        radix_params.big_lwe_dimension, merged_interesting_remainder.len);

    // after create_clean_version_of_merged_remainder
    // `merged_interesting_remainder` will be reused as
@@ -439,7 +439,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
        // We could call unchecked_scalar_ne
        // But we are in the special case where scalar == 0
        // So we can skip some stuff
-        host_compare_with_zero_equality(
+        host_compare_with_zero_equality<Torus>(
            streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
            mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
@@ -447,7 +447,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
        tmp_1.len =
            ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);

-        is_at_least_one_comparisons_block_true(
+        is_at_least_one_comparisons_block_true<Torus>(
            streams, gpu_indexes, gpu_count,
            at_least_one_upper_block_is_non_zero.data, tmp_1.data,
            mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
@@ -460,7 +460,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
    //  `cleaned_merged_interesting_remainder` - radix ciphertext
    auto create_clean_version_of_merged_remainder =
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
-          integer_radix_apply_univariate_lookup_table_kb(
+          integer_radix_apply_univariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data, bsks, ksks,
@@ -486,10 +486,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

-    host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
-                  subtraction_overflowed.data,
-                  at_least_one_upper_block_is_non_zero.data,
-                  radix_params.big_lwe_dimension, 1);
+    host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
+                         subtraction_overflowed.data,
+                         at_least_one_upper_block_is_non_zero.data,
+                         radix_params.big_lwe_dimension, 1);

    int factor = (i) ? 3 : 2;
    int factor_lut_id = factor - 2;
@@ -528,10 +528,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
          mem_ptr->merge_overflow_flags_luts[pos_in_block]
              ->params.message_modulus);

-      host_addition(streams[0], gpu_indexes[0],
-                    &quotient[block_of_bit * big_lwe_size],
-                    &quotient[block_of_bit * big_lwe_size],
-                    did_not_overflow.data, radix_params.big_lwe_dimension, 1);
+      host_addition<Torus>(
+          streams[0], gpu_indexes[0], &quotient[block_of_bit * big_lwe_size],
+          &quotient[block_of_bit * big_lwe_size], did_not_overflow.data,
+          radix_params.big_lwe_dimension, 1);
    };

    for (uint j = 0; j < gpu_count; j++) {
@@ -564,17 +564,17 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

  // Clean the quotient and remainder
  // as even though they have no carries, they are not at nominal noise level
-  host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
-                remainder2.data, radix_params.big_lwe_dimension,
-                remainder1.len);
+  host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1.data,
+                       remainder2.data, radix_params.big_lwe_dimension,
+                       remainder1.len);

  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
      bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
      ksks, num_blocks, mem_ptr->message_extract_lut_2);
  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -53,7 +53,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

-  scratch_cuda_propagate_single_carry_kb_inplace(
+  scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
@@ -131,6 +131,19 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

+void cuda_apply_many_univariate_lut_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
+    void **bsks, uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {
+
+  host_apply_many_univariate_lut_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<uint64_t *>(input_radix_lwe),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
+      lut_count, lut_stride);
+}
+
 void scratch_cuda_apply_bivariate_lut_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -195,15 +208,15 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
-    void **bsks, uint32_t num_blocks, uint32_t shift) {
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift) {

  int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(output_radix_lwe),
-      static_cast<uint64_t *>(input_radix_lwe), params,
+      static_cast<uint64_t *>(generates_or_propagates), params,
      (int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      num_blocks);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -78,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
          "pointers should be different");
  }
  cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_right<<<blocks_count, 1024, 0, streams[0]>>>(
+  radix_blocks_rotate_right<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
      dst, src, value, blocks_count, lwe_size);
 }

@@ -95,7 +95,7 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
          "pointers should be different");
  }
  cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_left<<<blocks_count, 1024, 0, streams[0]>>>(
+  radix_blocks_rotate_left<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
      dst, src, value, blocks_count, lwe_size);
 }

@@ -124,8 +124,8 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t lwe_size) {
  cudaSetDevice(gpu_indexes[0]);
  int num_blocks = blocks_count / 2, num_threads = 1024;
-  radix_blocks_reverse_lwe_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
-      src, blocks_count, lwe_size);
+  radix_blocks_reverse_lwe_inplace<Torus>
+      <<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
 }

 // polynomial_size threads
@@ -164,9 +164,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,
  int num_blocks = 0, num_threads = 0;
  int num_entries = num_radix_blocks * (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
-      lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
-      lwe_dimension, shift, num_radix_blocks);
+  device_pack_bivariate_blocks<Torus>
+      <<<num_blocks, num_threads, 0, streams[0]>>>(
+          lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2,
+          lwe_indexes_in, lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
 }

@@ -188,6 +189,93 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
+  /// For multi GPU execution we create vectors of pointers for inputs and
+  /// outputs
+  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  if (active_gpu_count == 1) {
+    execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
+                                   lwe_trivial_indexes_vec[0], lwe_array_in,
+                                   lut->lwe_indexes_in, ksks, big_lwe_dimension,
+                                   small_lwe_dimension, ks_base_log, ks_level,
+                                   num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
+        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
+        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
+  } else {
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
+        lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
+        big_lwe_dimension + 1);
+
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                   lwe_after_ks_vec, lwe_trivial_indexes_vec,
+                                   lwe_array_in_vec, lwe_trivial_indexes_vec,
+                                   ksks, big_lwe_dimension, small_lwe_dimension,
+                                   ks_base_log, ks_level, num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
+        lut_stride);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      lwe_array_out, lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes,
+                                      num_radix_blocks, big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+  }
+}
+
+template <typename Torus>
+__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t lut_count,
+    uint32_t lut_stride) {
+  // apply_lookup_table
+  auto params = lut->params;
+  auto pbs_type = params.pbs_type;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+  auto small_lwe_dimension = params.small_lwe_dimension;
+  auto ks_level = params.ks_level;
+  auto ks_base_log = params.ks_base_log;
+  auto pbs_level = params.pbs_level;
+  auto pbs_base_log = params.pbs_base_log;
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
+  auto grouping_factor = params.grouping_factor;
+
  /// For multi GPU execution we create vectors of pointers for inputs and
  /// outputs
  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
@@ -210,7 +298,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type);
+        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -236,7 +324,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks, pbs_type);
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
+        lut_stride);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -271,12 +360,16 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
+
  // Left message is shifted
  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
-  pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
-                        lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
-                        lut->lwe_indexes_in, big_lwe_dimension, shift,
-                        num_radix_blocks);
+  pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count,
+                               lwe_array_pbs_in, lut->lwe_trivial_indexes,
+                               lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
+                               big_lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  /// For multi GPU execution we create vectors of pointers for inputs and
@@ -301,7 +394,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type);
+        grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
  } else {
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    multi_gpu_scatter_lwe_async<Torus>(
@@ -323,7 +416,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks, pbs_type);
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
+        lut_stride);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -380,7 +474,7 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
    body[i] = -body[i];
  }

-  rotate_left(body, half_box_size, polynomial_size);
+  rotate_left<Torus>(body, half_box_size, polynomial_size);
 }

 template <typename Torus>
@@ -442,7 +536,6 @@ void generate_device_accumulator_bivariate(
                                         message_modulus, carry_modulus, f);

  // copy host lut and lut_indexes_vec to device
-  cuda_synchronize_stream(stream, gpu_index);
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                           (glwe_dimension + 1) * polynomial_size *
                               sizeof(Torus),
@@ -508,7 +601,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, f);

-  cuda_synchronize_stream(stream, gpu_index);
  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
@@ -590,13 +682,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
      ksks, num_blocks, luts_array);

  // compute prefix sum with hillis&steele
-  host_compute_prefix_sum_hillis_steele(
+  host_compute_prefix_sum_hillis_steele<Torus>(
      streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
      params, luts_carry_propagation_sum, bsks, ksks, num_blocks);

-  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
-                                 generates_or_propagates, 1, num_blocks,
-                                 big_lwe_size);
+  host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                        step_output, generates_or_propagates, 1,
+                                        num_blocks, big_lwe_size);
  if (carry_out != nullptr) {
    cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes,
                                 streams[0], gpu_indexes[0]);
@@ -610,8 +702,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 gpu_indexes[0]);
  }

-  host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
-                glwe_dimension * polynomial_size, num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
+                       step_output, glwe_dimension * polynomial_size,
+                       num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -664,14 +757,15 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
      overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
      big_lwe_size_bytes, streams[0], gpu_indexes[0]);

-  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
-                                 generates_or_propagates, 1, num_blocks,
-                                 big_lwe_size);
+  host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                        step_output, generates_or_propagates, 1,
+                                        num_blocks, big_lwe_size);
  cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
                    gpu_indexes[0]);

-  host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
-                   step_output, glwe_dimension * polynomial_size, num_blocks);
+  host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
+                          step_output, glwe_dimension * polynomial_size,
+                          num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -697,6 +791,9 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
  int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
  int small_lwe_size = (params.small_lwe_dimension + 1);

+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  for (int i = 0; i < num_blocks; i++) {
    auto cur_input_block = &input_blocks[i * big_lwe_size];

@@ -719,7 +816,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
        mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
        params.glwe_dimension, params.small_lwe_dimension,
        params.polynomial_size, params.pbs_base_log, params.pbs_level,
-        params.grouping_factor, 2, params.pbs_type);
+        params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride);

    cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
                                 big_lwe_size * sizeof(Torus), streams[0],
@@ -727,10 +824,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,

    if (i < num_blocks - 1) {
      auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
-      host_addition(streams[0], gpu_indexes[0], next_input_block,
-                    next_input_block,
-                    &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
-                    params.big_lwe_dimension, 1);
+      host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
+                           next_input_block,
+                           &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
+                           params.big_lwe_dimension, 1);
    }
  }
 }
@@ -794,7 +891,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
-  device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
+  device_pack_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
      lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
 }

@@ -840,7 +937,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_create_trivial_radix<<<grid, thds, 0, stream>>>(
+  device_create_trivial_radix<Torus><<<grid, thds, 0, stream>>>(
      lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
 }
@@ -857,7 +954,7 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
                             uint32_t num_radix_blocks, uint32_t bits_per_block,
                             int_bit_extract_luts_buffer<Torus> *bit_extract) {

-  integer_radix_apply_univariate_lookup_table_kb(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
      num_radix_blocks * bits_per_block, bit_extract->lut);
 }
@@ -870,7 +967,6 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
             std::function<Torus(Torus)> sign_handler_f, void **bsks,
             Torus **ksks, uint32_t num_sign_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
@@ -904,9 +1000,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    while (num_sign_blocks > 2) {
-      pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
-                  big_lwe_dimension, num_sign_blocks, 4);
-      integer_radix_apply_univariate_lookup_table_kb(
+      pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
+                         big_lwe_dimension, num_sign_blocks, 4);
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
          num_sign_blocks / 2, lut);

@@ -937,11 +1033,11 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
        final_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
-                2, 4);
-    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
-                                                   gpu_count, signs_array_out,
-                                                   signs_b, bsks, ksks, 1, lut);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
+                       big_lwe_dimension, 2, 4);
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
+        1, lut);

  } else {

@@ -957,9 +1053,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
        final_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
-                                                   gpu_count, signs_array_out,
-                                                   signs_a, bsks, ksks, 1, lut);
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
+        1, lut);
  }
 }

@@ -992,6 +1088,18 @@ void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
      num_blocks, mem);
 }

+template <typename Torus>
+void host_apply_many_univariate_lut_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *radix_lwe_out, Torus *radix_lwe_in, int_radix_lut<Torus> *mem,
+    Torus **ksks, void **bsks, uint32_t num_blocks, uint32_t lut_count,
+    uint32_t lut_stride) {
+  // Forwards to the many-LUT apply routine; it is passed bsks before ksks.
+  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
+      num_blocks, mem, lut_count, lut_stride);
+}
+
 template <typename Torus>
 void scratch_cuda_apply_bivariate_lut_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -241,7 +241,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 1024:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -249,7 +250,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 2048:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -257,7 +259,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 4096:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -265,7 +268,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 8192:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -273,7 +277,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  case 16384:
    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -281,7 +286,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
-        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
+        nullptr);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -186,9 +186,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
    uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
-    int_radix_lut<Torus> *reused_lut = nullptr) {
+    int_radix_lut<Torus> *reused_lut) {

  auto new_blocks = mem_ptr->new_blocks;
+  auto new_blocks_copy = mem_ptr->new_blocks_copy;
  auto old_blocks = mem_ptr->old_blocks;
  auto small_lwe_vector = mem_ptr->small_lwe_vector;

@@ -205,12 +206,31 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
  auto small_lwe_size = small_lwe_dimension + 1;

+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
+
+  if (num_radix_in_vec == 0)
+    return;
+  if (num_radix_in_vec == 1) {
+    cuda_memcpy_async_gpu_to_gpu(radix_lwe_out, terms,
+                                 num_blocks_in_radix * big_lwe_size *
+                                     sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+    return;
+  }
  if (old_blocks != terms) {
    cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
                                 num_blocks_in_radix * num_radix_in_vec *
                                     big_lwe_size * sizeof(Torus),
                                 streams[0], gpu_indexes[0]);
  }
+  if (num_radix_in_vec == 2) {
+    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                         &old_blocks[num_blocks * big_lwe_size],
+                         big_lwe_dimension, num_blocks);
+    return;
+  }

  size_t r = num_radix_in_vec;
  size_t total_modulus = message_modulus * carry_modulus;
@@ -287,7 +307,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
        terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
        h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
        total_count, message_count, carry_count, sm_copy_count);
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
    auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
    luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
@@ -302,8 +321,11 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    // inside d_smart_copy_in there are only -1 values
    // it's fine to call smart_copy with same pointer
    // as source and destination
-    smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
-        new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
+    cuda_memcpy_async_gpu_to_gpu(new_blocks_copy, new_blocks,
+                                 r * num_blocks * big_lwe_size * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+    smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
+        new_blocks, new_blocks_copy, d_smart_copy_out, d_smart_copy_in,
        big_lwe_size);
    check_cuda_error(cudaGetLastError());

@@ -346,7 +368,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          glwe_dimension, small_lwe_dimension, polynomial_size,
          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
          mem_ptr->params.grouping_factor, total_count,
-          mem_ptr->params.pbs_type);
+          mem_ptr->params.pbs_type, lut_count, lut_stride);
    } else {
      cuda_synchronize_stream(streams[0], gpu_indexes[0]);

@@ -394,7 +416,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
          glwe_dimension, small_lwe_dimension, polynomial_size,
          mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
          mem_ptr->params.grouping_factor, total_count,
-          mem_ptr->params.pbs_type);
+          mem_ptr->params.pbs_type, lut_count, lut_stride);

      multi_gpu_gather_lwe_async<Torus>(
          streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
@@ -421,9 +443,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  luts_message_carry->release(streams, gpu_indexes, gpu_count);
  delete (luts_message_carry);

-  host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-                &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
-                num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                       &old_blocks[num_blocks * big_lwe_size],
+                       big_lwe_dimension, num_blocks);
 }

 template <typename Torus, class params>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -1,14 +1,16 @@
 #include "integer/negation.cuh"

-void cuda_negate_integer_radix_ciphertext_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus) {
+void cuda_negate_integer_radix_ciphertext_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus) {

-  host_integer_radix_negation(
+  host_integer_radix_negation<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_array),
-      lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), lwe_dimension,
+      lwe_ciphertext_count, message_modulus, carry_modulus);
 }

 void scratch_cuda_integer_radix_overflowing_sub_kb_64(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -25,14 +25,13 @@ template <typename Torus>
 __global__ void
 device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
                              uint64_t lwe_dimension, uint64_t message_modulus,
-                              uint64_t carry_modulus, uint64_t delta) {
+                              uint64_t delta) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < lwe_dimension + 1) {
    bool is_body = (tid == lwe_dimension);

    // z = ceil( degree / 2^p ) * 2^p
    uint64_t z = (2 * message_modulus - 1) / message_modulus;
-    __syncthreads();
    z *= message_modulus;

    // (0,Delta*z) - ct
@@ -47,12 +46,9 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,

      uint64_t encoded_zb = zb * delta;

-      __syncthreads();
-
      // (0,Delta*z) - ct
      output[tid] =
          (is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
-      __syncthreads();
    }
  }
 }
@@ -75,16 +71,15 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);
-  uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_negation<<<grid, thds, shared_mem, streams[0]>>>(
+  device_integer_radix_negation<<<grid, thds, 0, streams[0]>>>(
      output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
-      carry_modulus, delta);
+      delta);
  check_cuda_error(cudaGetLastError());
 }

@@ -107,7 +102,7 @@ __host__ void host_integer_overflowing_sub_kb(

  auto radix_params = mem_ptr->params;

-  host_unchecked_sub_with_correcting_term(
+  host_unchecked_sub_with_correcting_term<Torus>(
      streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left,
      radix_lwe_right, radix_params.big_lwe_dimension, num_blocks,
      radix_params.message_modulus, radix_params.carry_modulus,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
@@ -5,7 +5,7 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {

-  host_integer_radix_scalar_addition_inplace(
+  host_integer_radix_scalar_addition_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
      lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -18,10 +18,8 @@ __global__ void device_integer_radix_scalar_addition_inplace(

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
-    Torus scalar = scalar_input[tid];
-    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
-
-    *body += scalar * delta;
+    lwe_array[tid * (lwe_dimension + 1) + lwe_dimension] +=
+        scalar_input[tid] * delta;
  }
 }

@@ -45,9 +43,10 @@ __host__ void host_integer_radix_scalar_addition_inplace(
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_scalar_addition_inplace<<<grid, thds, 0, streams[0]>>>(
-      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
-      delta);
+  device_integer_radix_scalar_addition_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
+                                      input_lwe_ciphertext_count, lwe_dimension,
+                                      delta);
  check_cuda_error(cudaGetLastError());
 }

@@ -83,8 +82,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0, streams[0]>>>(
-      lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
+  device_integer_radix_add_scalar_one_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, input_lwe_ciphertext_count,
+                                      lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
 }

@@ -122,10 +122,10 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
-                                                    streams[0]>>>(
-      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
-      delta);
+  device_integer_radix_scalar_subtraction_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
+                                      input_lwe_ciphertext_count, lwe_dimension,
+                                      delta);
  check_cuda_error(cudaGetLastError());
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -3,6 +3,58 @@

 #include "integer/comparison.cuh"

+template <typename Torus>
+__host__ void scalar_compare_radix_blocks_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {
+  // Per block: lwe_array_out = LUT(lwe_array_in - scalar_blocks) + 1.
+  if (num_radix_blocks == 0)
+    return;
+  auto params = mem_ptr->params;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+  auto message_modulus = params.message_modulus;
+  auto carry_modulus = params.carry_modulus;
+
+  // When rhs > lhs, the subtraction overflows and the padding bit is set to
+  // 1, meaning that the output of the PBS will be the negative (modulo the
+  // message space).
+  //
+  // Example:
+  // lhs: 1, rhs: 3, message modulus: 4, carry modulus: 4
+  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
+  // Since there was an overflow, the padding bit is 1 and not 0.
+  // When applying the LUT to an input value of 14 we would expect 1, but
+  // since the padding bit is 1 we get -1 modulo our message space, so
+  // (-1) % (4 * 4) = 15 = 1|1111.
+  // We then add one (last step below) and get 0 = 0|0000.
+
+  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
+  cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
+                               num_radix_blocks * (big_lwe_dimension + 1) *
+                                   sizeof(Torus),
+                               streams[0], gpu_indexes[0]);
+  // Subtract
+  // Here we need the true LWE sub, not the one that comes from shortint.
+  host_integer_radix_scalar_subtraction_inplace<Torus>(
+      streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
+      big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
+
+  // Apply the is_non_zero LUT of the equality buffer to compare to 0
+  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
+      ksks, num_radix_blocks, sign_lut);
+
+  // Add one
+  // Here lhs can have the following values: (-1) % (message modulus * carry
+  // modulus), 0, 1. So the output values after the addition will be: 0, 1, 2.
+  host_integer_radix_add_scalar_one_inplace<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
+      num_radix_blocks, message_modulus, carry_modulus);
+}
+
 template <typename Torus>
 __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -45,10 +97,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
  if (total_num_scalar_blocks == 0) {
    // We only have to compare blocks with zero
    // means scalar is zero
-    host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
-                                    mem_ptr->tmp_lwe_array_out, lwe_array_in,
-                                    mem_ptr, bsks, ksks, total_num_radix_blocks,
-                                    mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
+        lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks,
+        mem_ptr->is_zero_lut);

    auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
      x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -91,10 +143,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

-    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                total_num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       total_num_scalar_blocks, message_modulus);

    // From this point we have half number of blocks
    num_lsb_radix_blocks /= 2;
@@ -106,22 +159,22 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    // - 2 if lhs > rhs

    auto comparisons = mem_ptr->tmp_block_comparisons;
-    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                   comparisons, lhs, rhs, mem_ptr, bsks, ksks,
-                                   num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
+                                          comparisons, lhs, rhs, mem_ptr, bsks,
+                                          ksks, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
-                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        mem_ptr->identity_lut_f, bsks, ksks,
-                        num_lsb_radix_blocks);
+    tree_sign_reduction<Torus>(
+        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+        mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
+        num_lsb_radix_blocks);
    //////////////
    // msb
-    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                    lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
-                                    num_msb_radix_blocks, mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality<Torus>(
+        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
+        bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
@@ -145,7 +198,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        scalar_bivariate_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
        lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);

@@ -159,10 +212,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

-    pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       num_scalar_blocks, message_modulus);

    // From this point we have half number of blocks
    num_lsb_radix_blocks /= 2;
@@ -173,16 +227,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    // - 1 if lhs == rhs
    // - 2 if lhs > rhs
    auto comparisons = mem_ptr->tmp_lwe_array_out;
-    scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
-                                   lhs, rhs, mem_ptr, bsks, ksks,
-                                   num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                          comparisons, lhs, rhs, mem_ptr, bsks,
+                                          ksks, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
-                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        sign_handler_f, bsks, ksks, num_lsb_radix_blocks);
+    tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                               comparisons, mem_ptr->diff_buffer->tree_buffer,
+                               sign_handler_f, bsks, ksks,
+                               num_lsb_radix_blocks);
  }
 }

@@ -229,7 +284,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // We only have to compare blocks with zero
    // means scalar is zero
    Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
-    host_compare_with_zero_equality(
+    host_compare_with_zero_equality<Torus>(
        streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
        mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
    Torus *sign_block =
@@ -277,7 +332,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        scalar_bivariate_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
        sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);

@@ -304,10 +359,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

-    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                total_num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       total_num_scalar_blocks, message_modulus);

    // From this point we have half number of blocks
    num_lsb_radix_blocks /= 2;
@@ -319,24 +375,24 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // - 2 if lhs > rhs

    auto comparisons = mem_ptr->tmp_block_comparisons;
-    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                   comparisons, lhs, rhs, mem_ptr, bsks, ksks,
-                                   num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
+                                          comparisons, lhs, rhs, mem_ptr, bsks,
+                                          ksks, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
-                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        mem_ptr->identity_lut_f, bsks, ksks,
-                        num_lsb_radix_blocks);
+    tree_sign_reduction<Torus>(
+        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+        mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
+        num_lsb_radix_blocks);
    //////////////
    // msb
    // We remove the last block (which is the sign)
    Torus *are_all_msb_zeros = lwe_array_msb_out;
-    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                    are_all_msb_zeros, msb, mem_ptr, bsks, ksks,
-                                    num_msb_radix_blocks, mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality<Torus>(
+        msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb, mem_ptr,
+        bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);

    auto sign_bit_pos = (int)log2(message_modulus) - 1;

@@ -371,7 +427,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
        are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
        signed_msb_lut->params.message_modulus);
@@ -382,8 +438,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    //////////////
    // Reduce the two blocks into one final
-    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
-                 lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2);
+    reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                        lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks,
+                        2);

  } else {
    // We only have to do the regular comparison
@@ -403,10 +460,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

-    pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                num_lsb_radix_blocks - 1, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                       big_lwe_dimension, num_lsb_radix_blocks - 1,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                       num_lsb_radix_blocks - 1, message_modulus);

    // From this point we have half number of blocks
    num_lsb_radix_blocks /= 2;
@@ -415,19 +473,19 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // - 0 if lhs < rhs
    // - 1 if lhs == rhs
    // - 2 if lhs > rhs
-    scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                   lwe_array_ct_out, lhs, rhs, mem_ptr, bsks,
-                                   ksks, num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
+                                          lwe_array_ct_out, lhs, rhs, mem_ptr,
+                                          bsks, ksks, num_lsb_radix_blocks);
    Torus *encrypted_sign_block =
        lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
    Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);

    auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
-    create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
-                         scalar_sign_block, big_lwe_dimension, 1, 1,
-                         message_modulus, carry_modulus);
+    create_trivial_radix<Torus>(
+        msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
+        big_lwe_dimension, 1, 1, message_modulus, carry_modulus);

-    integer_radix_apply_bivariate_lookup_table_kb(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
        encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
        mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
@@ -439,9 +497,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
-                 lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
-                 num_lsb_radix_blocks + 1);
+    reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
+                        lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
+                        num_lsb_radix_blocks + 1);
  }
 }

@@ -452,14 +510,13 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  // Calculates the difference sign between the ciphertext and the scalar
  // - 0 if lhs < rhs
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto sign = mem_ptr->tmp_lwe_array_out;
-  integer_radix_signed_scalar_difference_check_kb(
+  integer_radix_signed_scalar_difference_check_kb<Torus>(
      streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
      total_num_scalar_blocks);
@@ -469,17 +526,17 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
  auto lwe_array_left = lwe_array_in;
  auto lwe_array_right = mem_ptr->tmp_block_comparisons;

-  create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
-                       scalar_blocks, params.big_lwe_dimension,
-                       total_num_radix_blocks, total_num_scalar_blocks,
-                       params.message_modulus, params.carry_modulus);
+  create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
+                              scalar_blocks, params.big_lwe_dimension,
+                              total_num_radix_blocks, total_num_scalar_blocks,
+                              params.message_modulus, params.carry_modulus);

  // Selector
  // CMUX for Max or Min
-  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             sign, lwe_array_left, lwe_array_right,
-                             mem_ptr->cmux_buffer, bsks, ksks,
-                             total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                    lwe_array_out, sign, lwe_array_left,
+                                    lwe_array_right, mem_ptr->cmux_buffer, bsks,
+                                    ksks, total_num_radix_blocks);
 }

 template <typename Torus>
@@ -492,12 +549,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(

  if (mem_ptr->is_signed) {
    // is signed and scalar is positive
-    integer_radix_signed_scalar_difference_check_kb(
+    integer_radix_signed_scalar_difference_check_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
        total_num_radix_blocks, total_num_scalar_blocks);
  } else {
-    integer_radix_unsigned_scalar_difference_check_kb(
+    integer_radix_unsigned_scalar_difference_check_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
        total_num_radix_blocks, total_num_scalar_blocks);
@@ -513,70 +570,16 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb(

  if (mem_ptr->is_signed) {
    // is signed and scalar is positive
-    integer_radix_signed_scalar_maxmin_kb(
+    integer_radix_signed_scalar_maxmin_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
        total_num_scalar_blocks);
  } else {
-    integer_radix_unsigned_scalar_maxmin_kb(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
-        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
-        total_num_scalar_blocks);
+    PANIC("Cuda error: only signed scalar maxmin can be called in signed "
+          "scalar comparison")
  }
 }

-template <typename Torus>
-__host__ void scalar_compare_radix_blocks_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
-    uint32_t num_radix_blocks) {
-
-  if (num_radix_blocks == 0)
-    return;
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  // When rhs > lhs, the subtraction will overflow, and the bit of padding will
-  // be set to 1
-  // meaning that the output of the pbs will be the negative (modulo message
-  // space)
-  //
-  // Example:
-  // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
-  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
-  // Since there was an overflow the bit of padding is 1 and not 0.
-  // When applying the LUT for an input value of 14 we would expect 1,
-  // but since the bit of padding is 1, we will get -1 modulus our message
-  // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
-
-  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
-  cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
-                               num_radix_blocks * (big_lwe_dimension + 1) *
-                                   sizeof(Torus),
-                               streams[0], gpu_indexes[0]);
-  // Subtract
-  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_integer_radix_scalar_subtraction_inplace(
-      streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
-      big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
-
-  // Apply LUT to compare to 0
-  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb(
-      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
-      ksks, num_radix_blocks, sign_lut);
-
-  // Add one
-  // Here Lhs can have the following values: (-1) % (message modulus * carry
-  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(
-      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
-      num_radix_blocks, message_modulus, carry_modulus);
-}
-
 template <typename Torus>
 __host__ void host_integer_radix_scalar_maxmin_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -591,7 +594,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto sign = mem_ptr->tmp_lwe_array_out;
-  host_integer_radix_scalar_difference_check_kb(
+  host_integer_radix_scalar_difference_check_kb<Torus>(
      streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
      total_num_scalar_blocks);
@@ -601,17 +604,17 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  auto lwe_array_left = lwe_array_in;
  auto lwe_array_right = mem_ptr->tmp_block_comparisons;

-  create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
-                       scalar_blocks, params.big_lwe_dimension,
-                       total_num_radix_blocks, total_num_scalar_blocks,
-                       params.message_modulus, params.carry_modulus);
+  create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
+                              scalar_blocks, params.big_lwe_dimension,
+                              total_num_radix_blocks, total_num_scalar_blocks,
+                              params.message_modulus, params.carry_modulus);

  // Selector
  // CMUX for Max or Min
-  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
-                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
-                             total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out,
+      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
+      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
 }

 template <typename Torus>
@@ -659,10 +662,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
    auto packed_scalar =
        packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;

-    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
-                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0,
-                num_scalar_blocks, message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
+                       big_lwe_dimension, num_lsb_radix_blocks,
+                       message_modulus);
+    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_scalar,
+                       scalar_blocks, 0, num_scalar_blocks, message_modulus);

    cuda_memcpy_async_gpu_to_gpu(
        scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
@@ -670,7 +674,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
        gpu_indexes[0]);
    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
        bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
  }
@@ -689,9 +693,9 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
      PANIC("Cuda error: integer operation not supported")
    }

-    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                    lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
-                                    num_msb_radix_blocks, msb_lut);
+    host_compare_with_zero_equality<Torus>(
+        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
+        bsks, ksks, num_msb_radix_blocks, msb_lut);
  }

  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
@@ -701,13 +705,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb(

  switch (mem_ptr->op) {
  case COMPARISON_TYPE::EQ:
-    are_all_comparisons_block_true(
+    are_all_comparisons_block_true<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
        mem_ptr, bsks, ksks,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
    break;
  case COMPARISON_TYPE::NE:
-    is_at_least_one_comparisons_block_true(
+    is_at_least_one_comparisons_block_true<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
        mem_ptr, bsks, ksks,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -65,7 +65,7 @@ __host__ void host_integer_scalar_mul_radix(
      cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
                                   lwe_size_bytes * num_radix_blocks,
                                   streams[0], gpu_indexes[0]);
-      host_integer_radix_logical_scalar_shift_kb_inplace(
+      host_integer_radix_logical_scalar_shift_kb_inplace<T>(
          streams, gpu_indexes, gpu_count, ptr, shift_amount,
          mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
    } else {
@@ -82,15 +82,16 @@ __host__ void host_integer_scalar_mul_radix(
          preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
      T *block_shift_buffer =
          all_shifted_buffer + j * num_radix_blocks * lwe_size;
-      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                     block_shift_buffer, preshifted_radix_ct,
-                                     i / msg_bits, num_radix_blocks, lwe_size);
+      host_radix_blocks_rotate_right<T>(
+          streams, gpu_indexes, gpu_count, block_shift_buffer,
+          preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size);
      // create trivial assign for value = 0
      cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
                        streams[0], gpu_indexes[0]);
      j++;
    }
  }
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);

  cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
  mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
@@ -108,7 +109,7 @@ __host__ void host_integer_scalar_mul_radix(
    host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
        terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
-        num_radix_blocks, j);
+        num_radix_blocks, j, nullptr);

    auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
    host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
  // one block is responsible to process single lwe ciphertext
  if (mem->shift_type == LEFT_SHIFT) {
    // rotate right as the blocks are from LSB to MSB
-    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                   rotated_buffer, lwe_array, rotations,
-                                   num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          rotated_buffer, lwe_array, rotations,
+                                          num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, streams[0],
@@ -70,9 +70,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
-    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                   giver_blocks, lwe_array, 1, num_blocks,
-                                   big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          giver_blocks, lwe_array, 1,
+                                          num_blocks, big_lwe_size);

    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

@@ -83,9 +83,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

  } else {
    // rotate left as the blocks are from LSB to MSB
-    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                  rotated_buffer, lwe_array, rotations,
-                                  num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, streams[0],
@@ -97,8 +97,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
-    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
-                                  lwe_array, 1, num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         giver_blocks, lwe_array, 1, num_blocks,
+                                         big_lwe_size);

    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -53,9 +53,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

  if (mem->shift_type == LEFT_SHIFT) {
    // rotate right as the blocks are from LSB to MSB
-    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                   rotated_buffer, lwe_array, rotations,
-                                   num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          rotated_buffer, lwe_array, rotations,
+                                          num_blocks, big_lwe_size);

    // create trivial assign for value = 0
    cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
@@ -83,9 +83,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

  } else {
    // right shift
-    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                  rotated_buffer, lwe_array, rotations,
-                                  num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks, big_lwe_size);

    // rotate left as the blocks are from LSB to MSB
    // create trivial assign for value = 0
@@ -156,9 +156,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
  Torus *last_block_copy = &padding_block[big_lwe_size];

  if (mem->shift_type == RIGHT_SHIFT) {
-    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                  rotated_buffer, lwe_array, rotations,
-                                  num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks, big_lwe_size);
    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, streams[0],
                                 gpu_indexes[0]);
@@ -213,7 +213,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      }
      auto lut_univariate_padding_block =
          mem->lut_buffers_univariate[num_bits_in_block - 1];
-      integer_radix_apply_univariate_lookup_table_kb(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
          last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
      // Replace blocks 'pulled' from the left with the correct padding
@@ -227,7 +227,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      if (shift_within_block != 0) {
        auto lut_univariate_shift_last_block =
            mem->lut_buffers_univariate[shift_within_block - 1];
-        integer_radix_apply_univariate_lookup_table_kb(
+        integer_radix_apply_univariate_lookup_table_kb<Torus>(
            mem->local_streams_2, gpu_indexes, gpu_count, last_block,
            last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
      }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -88,9 +88,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    switch (mem->shift_type) {
    case LEFT_SHIFT:
      // rotate right as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                     rotated_input, input_bits_b, rotations,
-                                     total_nb_bits, big_lwe_size);
+      host_radix_blocks_rotate_right<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);

      if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
        for (int i = 0; i < rotations; i++)
@@ -103,9 +103,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
      break;
    case RIGHT_SHIFT:
      // rotate left as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                    rotated_input, input_bits_b, rotations,
-                                    total_nb_bits, big_lwe_size);
+      host_radix_blocks_rotate_left<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);

      if (mem->is_signed)
        for (int i = 0; i < rotations; i++)
@@ -119,15 +119,15 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
      break;
    case LEFT_ROTATE:
      // rotate right as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
-                                     rotated_input, input_bits_b, rotations,
-                                     total_nb_bits, big_lwe_size);
+      host_radix_blocks_rotate_right<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);
      break;
    case RIGHT_ROTATE:
      // rotate left as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
-                                    rotated_input, input_bits_b, rotations,
-                                    total_nb_bits, big_lwe_size);
+      host_radix_blocks_rotate_left<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);
      break;
    default:
      PANIC("Unknown operation")
@@ -135,22 +135,21 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

    // host_pack bits into one block so that we have
    // control_bit|b|a
-    cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
-                      streams[0], gpu_indexes[0]); // Do we need this?
-    pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs,
-                          mux_lut->lwe_indexes_out, rotated_input, input_bits_a,
-                          mux_lut->lwe_indexes_in, big_lwe_dimension, 2,
-                          total_nb_bits);
+    pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count, mux_inputs,
+                                 mux_lut->lwe_indexes_out, rotated_input,
+                                 input_bits_a, mux_lut->lwe_indexes_in,
+                                 big_lwe_dimension, 2, total_nb_bits);

    // The shift bit is already properly aligned/positioned
    for (int i = 0; i < total_nb_bits; i++)
-      host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size,
-                    mux_inputs + i * big_lwe_size, shift_bit,
-                    mem->params.big_lwe_dimension, 1);
+      host_addition<Torus>(streams[0], gpu_indexes[0],
+                           mux_inputs + i * big_lwe_size,
+                           mux_inputs + i * big_lwe_size, shift_bit,
+                           mem->params.big_lwe_dimension, 1);

    // we have
    // control_bit|b|a
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
        total_nb_bits, mux_lut);
  }
@@ -179,8 +178,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    auto bit_to_add = input_bits_a + i * big_lwe_size;

    for (int j = 0; j < num_radix_blocks; j++) {
-      host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add,
-                    big_lwe_dimension, 1);
+      host_addition<Torus>(streams[0], gpu_indexes[0], block, block, bit_to_add,
+                           big_lwe_dimension, 1);

      block += big_lwe_size;
      bit_to_add += bits_per_block * big_lwe_size;
@@ -188,7 +187,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

    // To give back a clean ciphertext
    auto cleaning_lut = mem->cleaning_lut;
-    integer_radix_apply_univariate_lookup_table_kb(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
        num_radix_blocks, cleaning_lut);
  }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -11,11 +11,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {

-  host_addition(static_cast<cudaStream_t>(stream), gpu_index,
-                static_cast<uint32_t *>(lwe_array_out),
-                static_cast<uint32_t *>(lwe_array_in_1),
-                static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
-                input_lwe_ciphertext_count);
+  host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint32_t *>(lwe_array_out),
+                          static_cast<uint32_t *>(lwe_array_in_1),
+                          static_cast<uint32_t *>(lwe_array_in_2),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }

 /*
@@ -51,11 +51,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {

-  host_addition(static_cast<cudaStream_t>(stream), gpu_index,
-                static_cast<uint64_t *>(lwe_array_out),
-                static_cast<uint64_t *>(lwe_array_in_1),
-                static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
-                input_lwe_ciphertext_count);
+  host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint64_t *>(lwe_array_out),
+                          static_cast<uint64_t *>(lwe_array_in_1),
+                          static_cast<uint64_t *>(lwe_array_in_2),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }
 /*
 * Perform the addition of a u32 input LWE ciphertext vector with a u32
@@ -66,11 +66,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
-                          static_cast<uint32_t *>(lwe_array_out),
-                          static_cast<uint32_t *>(lwe_array_in),
-                          static_cast<uint32_t *>(plaintext_array_in),
-                          input_lwe_dimension, input_lwe_ciphertext_count);
+  host_addition_plaintext<uint32_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint32_t *>(lwe_array_out),
+      static_cast<uint32_t *>(lwe_array_in),
+      static_cast<uint32_t *>(plaintext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
 }
 /*
 * Perform the addition of a u64 input LWE ciphertext vector with a u64 input
@@ -105,9 +106,10 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
-                          static_cast<uint64_t *>(lwe_array_out),
-                          static_cast<uint64_t *>(lwe_array_in),
-                          static_cast<uint64_t *>(plaintext_array_in),
-                          input_lwe_dimension, input_lwe_ciphertext_count);
+  host_addition_plaintext<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<uint64_t *>(plaintext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -43,7 +43,7 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
  cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
                               (lwe_dimension + 1) * lwe_ciphertext_count,
                               stream, gpu_index);
-  plaintext_addition<<<grid, thds, 0, stream>>>(
+  plaintext_addition<T><<<grid, thds, 0, stream>>>(
      output, lwe_input, plaintext_input, lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
 }
@@ -78,7 +78,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  addition<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
+  addition<T><<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
  check_cuda_error(cudaGetLastError());
 }

@@ -112,7 +112,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  subtraction<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
+  subtraction<T>
+      <<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
  check_cuda_error(cudaGetLastError());
 }

@@ -150,7 +151,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
                                   (input_lwe_dimension + 1) * sizeof(T),
                               stream, gpu_index);

-  radix_body_subtraction_inplace<<<grid, thds, 0, stream>>>(
+  radix_body_subtraction_inplace<T><<<grid, thds, 0, stream>>>(
      output, plaintext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
 }
@@ -176,7 +177,6 @@ __global__ void unchecked_sub_with_correcting_term(
  }
 }
 template <typename T>
-
 __host__ void host_unchecked_sub_with_correcting_term(
    cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
@@ -193,7 +193,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  unchecked_sub_with_correcting_term<<<grid, thds, 0, stream>>>(
+  unchecked_sub_with_correcting_term<T><<<grid, thds, 0, stream>>>(
      output, input_1, input_2, num_entries, lwe_size, message_modulus,
      carry_modulus, degree);
  check_cuda_error(cudaGetLastError());
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
@@ -9,7 +9,7 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_cleartext_vec_multiplication(
+  host_cleartext_vec_multiplication<uint32_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_array_in),
@@ -49,7 +49,7 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_cleartext_vec_multiplication(
+  host_cleartext_vec_multiplication<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_in),
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -46,7 +46,7 @@ host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  cleartext_vec_multiplication<<<grid, thds, 0, stream>>>(
+  cleartext_vec_multiplication<T><<<grid, thds, 0, stream>>>(
      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
 }
@@ -82,7 +82,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  cleartext_multiplication<<<grid, thds, 0, stream>>>(
+  cleartext_multiplication<T><<<grid, thds, 0, stream>>>(
      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
@@ -10,10 +10,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

-  host_negation(static_cast<cudaStream_t>(stream), gpu_index,
-                static_cast<uint32_t *>(lwe_array_out),
-                static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
-                input_lwe_ciphertext_count);
+  host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint32_t *>(lwe_array_out),
+                          static_cast<uint32_t *>(lwe_array_in),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }

 /*
@@ -44,8 +44,8 @@ void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

-  host_negation(static_cast<cudaStream_t>(stream), gpu_index,
-                static_cast<uint64_t *>(lwe_array_out),
-                static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
-                input_lwe_ciphertext_count);
+  host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint64_t *>(lwe_array_out),
+                          static_cast<uint64_t *>(lwe_array_in),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
@@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  negation<<<grid, thds, 0, stream>>>(output, input, num_entries);
+  negation<T><<<grid, thds, 0, stream>>>(output, input, num_entries);
  check_cuda_error(cudaGetLastError());
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -127,7 +127,8 @@ void execute_pbs_async(
    std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type) {
+    uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, uint32_t lut_count,
+    uint32_t lut_stride) {
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits
@@ -159,7 +160,8 @@ void execute_pbs_async(
            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu,
+            lut_count, lut_stride);
      }
      break;
    default:
@@ -198,7 +200,7 @@ void execute_pbs_async(
            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
            polynomial_size, grouping_factor, base_log, level_count,
-            num_inputs_on_gpu);
+            num_inputs_on_gpu, lut_count, lut_stride);
      }
      break;
    case CLASSICAL:
@@ -226,7 +228,8 @@ void execute_pbs_async(
            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
            current_lwe_array_in, current_lwe_input_indexes,
            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu,
+            lut_count, lut_stride);
      }
      break;
    default:
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
@@ -1,15 +1,5 @@
 #include "programmable_bootstrap_amortized.cuh"

-/*
- * Returns the buffer size for 64 bits executions
- */
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count) {
-  return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
-      glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
-}
-
 /*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the amortized PBS on 32 bits inputs, into `buffer`. It also
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -207,9 +207,9 @@ __global__ void device_programmable_bootstrap_amortized(
  // the resulting constant coefficient of the accumulator
  // For the mask it's more complicated
  sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
-                                     glwe_dimension, 0);
+                                     glwe_dimension);
  sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
-                                     glwe_dimension, 0);
+                                     glwe_dimension);
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -44,7 +44,8 @@ __global__ void device_programmable_bootstrap_cg(
    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+    uint64_t device_memory_size_per_block, uint32_t lut_count,
+    uint32_t lut_stride) {

  grid_group grid = this_grid();

@@ -98,8 +99,8 @@ __global__ void device_programmable_bootstrap_cg(

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
-      accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, false,
-      1);
+      accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
+      false);

  for (int i = 0; i < lwe_dimension; i++) {
    synchronize_threads_in_block();
@@ -111,13 +112,13 @@ __global__ void device_programmable_bootstrap_cg(
    // Perform ACC * (X^ä - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
-        accumulator, accumulator_rotated, a_hat, 1);
+        accumulator, accumulator_rotated, a_hat);

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator_rotated, base_log, level_count, 1);
+        accumulator_rotated, base_log, level_count);

    synchronize_threads_in_block();

@@ -125,7 +126,7 @@ __global__ void device_programmable_bootstrap_cg(
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
-                                           accumulator_rotated, 1);
+                                           accumulator_rotated);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

    // We are using the same memory space for accumulator_fft and
@@ -150,9 +151,39 @@ __global__ void device_programmable_bootstrap_cg(
    // Perform a sample extract. At this point, all blocks have the result, but
    // we do the computation at block 0 to avoid waiting for extra blocks, in
    // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
+    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+    if (lut_count > 1) {
+      for (int i = 1; i < lut_count; i++) {
+        auto next_lwe_array_out =
+            lwe_array_out +
+            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+        auto next_block_lwe_array_out =
+            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                    (glwe_dimension * polynomial_size + 1) +
+                                blockIdx.y * polynomial_size];
+
+        sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                           accumulator, glwe_dimension,
+                                           i * lut_stride);
+      }
+    }
  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
+    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+    if (lut_count > 1) {
+      for (int i = 1; i < lut_count; i++) {
+
+        auto next_lwe_array_out =
+            lwe_array_out +
+            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+        auto next_block_lwe_array_out =
+            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                    (glwe_dimension * polynomial_size + 1) +
+                                blockIdx.y * polynomial_size];
+
+        sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                           accumulator, 0, i * lut_stride);
+      }
+    }
  }
 }

@@ -202,7 +233,8 @@ __host__ void host_programmable_bootstrap_cg(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+    uint32_t lut_count, uint32_t lut_stride) {

  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
@@ -226,7 +258,7 @@ __host__ void host_programmable_bootstrap_cg(
  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

-  void *kernel_args[14];
+  void *kernel_args[16];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
@@ -240,6 +272,8 @@ __host__ void host_programmable_bootstrap_cg(
  kernel_args[10] = &base_log;
  kernel_args[11] = &level_count;
  kernel_args[12] = &d_mem;
+  kernel_args[14] = &lut_count;
+  kernel_args[15] = &lut_stride;

  if (max_shared_memory < partial_sm) {
    kernel_args[13] = &full_dm;
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -30,7 +30,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
-        int8_t *device_mem, uint64_t device_memory_size_per_block) {
+        int8_t *device_mem, uint64_t device_memory_size_per_block,
+        uint32_t lut_count, uint32_t lut_stride) {

  grid_group grid = this_grid();

@@ -86,7 +87,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false, 1);
+        false);
  } else {
    // Load the accumulator calculated in previous iterations
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -98,13 +99,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator, base_log, level_count, 1);
+        accumulator, base_log, level_count);

    // Decompose the accumulator. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
-    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
-                                           1);
+    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

    // We are using the same memory space for accumulator_fft and
@@ -130,11 +130,44 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
-                                         0);
+      // Always extract one by default
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, glwe_dimension,
+                                             i * lut_stride);
+        }
+      }
+
    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
-                                         0);
+
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
+      }
    }
  } else {
    // Load the accumulator calculated in previous iterations
@@ -259,7 +292,8 @@ __host__ void execute_cg_external_product_loop(
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, uint32_t lwe_offset, uint32_t lut_count,
+    uint32_t lut_stride) {

  uint64_t full_dm =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
@@ -278,13 +312,15 @@ __host__ void execute_cg_external_product_loop(

  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;

  auto d_mem = buffer->d_mem_acc_cg;
  auto keybundle_fft = buffer->keybundle_fft;
  auto global_accumulator = buffer->global_accumulator;
  auto buffer_fft = buffer->global_accumulator_fft;

-  void *kernel_args[20];
+  void *kernel_args[22];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
@@ -304,6 +340,8 @@ __host__ void execute_cg_external_product_loop(
  kernel_args[16] = &chunk_size;
  kernel_args[17] = &keybundle_size_per_input;
  kernel_args[18] = &d_mem;
+  kernel_args[20] = &lut_count;
+  kernel_args[21] = &lut_stride;

  dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
  dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -336,7 +374,8 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
    Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, num_samples, polynomial_size);
@@ -355,7 +394,8 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset,
+        lut_count, lut_stride);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -122,7 +122,8 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride) {

  switch (polynomial_size) {
  case 256:
@@ -130,49 +131,56 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 512:
    host_programmable_bootstrap_tbc<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 1024:
    host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 2048:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 4096:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 8192:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 16384:
    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -182,25 +190,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
 }
 #endif

-/*
- * Returns the buffer size for 64 bits executions
- */
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count) {
-
-  if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
-          glwe_dimension, polynomial_size, level_count,
-          input_lwe_ciphertext_count))
-    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count);
-  else
-    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count);
-}
-
 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_cg(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
@@ -389,7 +378,8 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride) {

  switch (polynomial_size) {
  case 256:
@@ -397,49 +387,56 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 512:
    host_programmable_bootstrap_cg<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 1024:
    host_programmable_bootstrap_cg<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 2048:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 4096:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 8192:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 16384:
    host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -455,7 +452,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride) {

  switch (polynomial_size) {
  case 256:
@@ -463,49 +461,56 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 512:
    host_programmable_bootstrap<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 1024:
    host_programmable_bootstrap<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 2048:
    host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 4096:
    host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 8192:
    host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case 16384:
    host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -522,7 +527,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples) {
+    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {

  if (base_log > 32)
    PANIC("Cuda error (classical PBS): base log should be > number of bits "
@@ -542,7 +547,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
 #else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -556,7 +562,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case DEFAULT:
    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
@@ -567,7 +574,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
        static_cast<uint32_t *>(lwe_array_in),
        static_cast<uint32_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -641,7 +649,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples) {
+    uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be > number of bits "
          "in the ciphertext representation (64)");
@@ -660,7 +668,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
 #else
    PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -674,7 +683,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  case PBS_VARIANT::DEFAULT:
    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -685,7 +695,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
+        lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -713,7 +724,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -722,7 +734,8 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
    void *stream, uint32_t gpu_index,
@@ -742,7 +755,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
    void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
@@ -751,7 +765,8 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);

 template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
    void *stream, uint32_t gpu_index,
@@ -779,7 +794,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
    uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);
 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
    void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
    uint64_t *lwe_output_indexes, uint64_t *lut_vector,
@@ -787,7 +803,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride);
 template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
    void *stream, uint32_t gpu_index,
    pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -82,7 +82,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false, 1);
+        false);

    // Persist
    int tid = threadIdx.x;
@@ -102,20 +102,20 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  // Perform ACC * (X^ä - 1)
  multiply_by_monomial_negacyclic_and_sub_polynomial<
      Torus, params::opt, params::degree / params::opt>(global_slice,
-                                                        accumulator, a_hat, 1);
+                                                        accumulator, a_hat);

  // Perform a rounding to increase the accuracy of the
  // bootstrapped ciphertext
  round_to_closest_multiple_inplace<Torus, params::opt,
                                    params::degree / params::opt>(
-      accumulator, base_log, level_count, 1);
+      accumulator, base_log, level_count);

  synchronize_threads_in_block();

  // Decompose the accumulator. Each block gets one level of the
  // decomposition, for the mask and the body (so block 0 will have the
  // accumulator decomposed at level 0, 1 at 1, etc.)
-  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
+  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
  gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

  // We are using the same memory space for accumulator_fft and
@@ -141,7 +141,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        Torus *global_accumulator, double2 *global_accumulator_fft,
        uint32_t lwe_iteration, uint32_t lwe_dimension,
        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-        int8_t *device_mem, uint64_t device_memory_size_per_block) {
+        int8_t *device_mem, uint64_t device_memory_size_per_block,
+        uint32_t lut_count, uint32_t lut_stride) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -215,11 +216,39 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
-                                         0);
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, glwe_dimension,
+                                             i * lut_stride);
+        }
+      }
    } else if (blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
-                                         0);
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
+      }
    }
  } else {
    // Persist the updated accumulator
@@ -377,16 +406,15 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
 }

 template <typename Torus, class params>
-__host__ void
-execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-                 Torus *lwe_output_indexes, Torus *lut_vector,
-                 Torus *lut_vector_indexes, double2 *bootstrapping_key,
-                 Torus *global_accumulator, double2 *global_accumulator_fft,
-                 uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
-                 uint32_t glwe_dimension, uint32_t polynomial_size,
-                 uint32_t base_log, uint32_t level_count, int8_t *d_mem,
-                 int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
-                 uint64_t full_sm, uint64_t full_dm) {
+__host__ void execute_step_two(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    double2 *bootstrapping_key, Torus *global_accumulator,
+    double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration,
+    uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm,
+    uint64_t full_dm, uint32_t lut_count, uint32_t lut_stride) {

  int max_shared_memory = cuda_get_max_shared_memory(0);
  cudaSetDevice(gpu_index);
@@ -399,21 +427,21 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, full_dm);
+            level_count, d_mem, full_dm, lut_count, lut_stride);
  } else if (max_shared_memory < full_sm) {
    device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, partial_dm);
+            level_count, d_mem, partial_dm, lut_count, lut_stride);
  } else {
    device_programmable_bootstrap_step_two<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
-            level_count, d_mem, 0);
+            level_count, d_mem, 0, lut_count, lut_stride);
  }
  check_cuda_error(cudaGetLastError());
 }
@@ -427,7 +455,8 @@ __host__ void host_programmable_bootstrap(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+    uint32_t lut_count, uint32_t lut_stride) {
  cudaSetDevice(gpu_index);

  // With SM each block corresponds to either the mask or body, no need to
@@ -463,7 +492,8 @@ __host__ void host_programmable_bootstrap(
        lut_vector_indexes, bootstrapping_key, global_accumulator,
        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
-        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two);
+        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
+        lut_count, lut_stride);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -65,7 +65,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -78,7 +79,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 512:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -86,7 +87,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 1024:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -94,7 +95,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 2048:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -102,7 +103,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 4096:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -110,7 +111,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 8192:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -118,7 +119,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 16384:
    host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -126,7 +127,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -142,7 +143,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -155,7 +157,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 512:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -163,7 +165,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 1024:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -171,7 +173,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 2048:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -179,7 +181,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 4096:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -187,7 +189,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 8192:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -195,7 +197,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 16384:
    host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -203,7 +205,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -218,7 +220,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+    uint32_t lut_stride) {

  pbs_buffer<uint64_t, MULTI_BIT> *buffer =
      (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
@@ -235,7 +238,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
 #else
    PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
@@ -250,7 +253,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case PBS_VARIANT::DEFAULT:
    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -262,7 +265,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<uint64_t *>(lwe_input_indexes),
        static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
@@ -440,6 +443,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,

  int max_blocks_per_sm;
  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
  if (max_shared_memory < full_sm_keybundle)
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_blocks_per_sm,
@@ -499,7 +503,8 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);

 template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
    void *stream, uint32_t gpu_index,
@@ -515,7 +520,8 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);

 template bool
 has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
@@ -586,7 +592,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -599,7 +606,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 512:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -607,7 +614,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 1024:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -615,7 +622,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 2048:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -623,7 +630,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 4096:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -631,7 +638,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 8192:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -639,7 +646,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  case 16384:
    host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -647,7 +654,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples);
+        num_samples, lut_count, lut_stride);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -670,5 +677,6 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
    uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride);
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -18,9 +18,9 @@
 #include <vector>

 template <typename Torus, class params>
-__device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
-                                            uint32_t ggsw_idx,
-                                            uint32_t grouping_factor) {
+__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
+                                               uint32_t ggsw_idx,
+                                               uint32_t grouping_factor) {
  Torus x = 0;
  for (int i = 0; i < grouping_factor; i++) {
    uint32_t mask_position = grouping_factor - (i + 1);
@@ -31,6 +31,13 @@ __device__ Torus calculates_monomial_degree(const Torus *lwe_array_group,
  return modulus_switch(x, params::log2_degree + 1);
 }

+// Returns polynomial_size * (glwe_dimension + 1)^2 * level_count.
+// The keybundle kernel multiplies this value by the group index g to step
+// from one bootstrapping-key polynomial to the next.
+__device__ __forceinline__ int
+get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
+                          uint32_t level_count) {
+  const int glwe_size = glwe_dimension + 1;
+  // Keep the left-to-right product so the arithmetic stays unsigned.
+  return polynomial_size * glwe_size * glwe_size * level_count;
+}
+
 template <typename Torus, class params, sharedMemDegree SMD>
 __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    const Torus *__restrict__ lwe_array_in,
@@ -60,8 +67,6 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
  uint32_t input_idx = blockIdx.x / lwe_chunk_size;

  if (lwe_iteration < (lwe_dimension / grouping_factor)) {
-    //
-    Torus *accumulator = (Torus *)selected_memory;

    const Torus *block_lwe_array_in =
        &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
@@ -81,57 +86,52 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
    const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
        bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
        grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-    const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+    const Torus *bsk_poly_ini = bsk_slice + poly_id * params::degree;

-    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
-        bsk_poly, accumulator);
+    Torus reg_acc[params::opt];
+
+    copy_polynomial_in_regs<Torus, params::opt, params::degree / params::opt>(
+        bsk_poly_ini, reg_acc);
+
+    int offset =
+        get_start_ith_ggsw_offset(polynomial_size, glwe_dimension, level_count);
+
+    // Precalculate the monomial degrees and store them in shared memory
+    uint32_t *monomial_degrees = (uint32_t *)selected_memory;
+    if (threadIdx.x < (1 << grouping_factor)) {
+      const Torus *lwe_array_group =
+          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
+      monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
+          lwe_array_group, threadIdx.x, grouping_factor);
+    }
+    synchronize_threads_in_block();

    // Accumulate the other terms
    for (int g = 1; g < (1 << grouping_factor); g++) {

-      const Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
-          bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
-          grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
-      const Torus *bsk_poly = bsk_slice + poly_id * params::degree;
+      uint32_t monomial_degree = monomial_degrees[g];

-      // Calculates the monomial degree
-      const Torus *lwe_array_group =
-          block_lwe_array_in + rev_lwe_iteration * grouping_factor;
-      uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
-          lwe_array_group, g, grouping_factor);
-
-      synchronize_threads_in_block();
+      const Torus *bsk_poly = bsk_poly_ini + g * offset;
      // Multiply by the bsk element
-      polynomial_accumulate_monic_monomial_mul<Torus>(
-          accumulator, bsk_poly, monomial_degree, threadIdx.x, params::degree,
-          params::opt, false);
+      polynomial_product_accumulate_by_monomial_nosync<Torus, params>(
+          reg_acc, bsk_poly, monomial_degree);
    }
+    synchronize_threads_in_block(); // needed because we are going to reuse the
+                                    // shared memory for the fft

-    synchronize_threads_in_block();
-
-    // Move accumulator to local memory
-    double2 temp[params::opt / 2];
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt / 2; i++) {
-      temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
-      temp[i].y =
-          __ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
-      temp[i].x /= (double)std::numeric_limits<Torus>::max();
-      temp[i].y /= (double)std::numeric_limits<Torus>::max();
-      tid += params::degree / params::opt;
-    }
-
-    synchronize_threads_in_block();
    // Move from local memory back to shared memory but as complex
-    tid = threadIdx.x;
+    int tid = threadIdx.x;
    double2 *fft = (double2 *)selected_memory;
 #pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
-      fft[tid] = temp[i];
+      fft[tid] =
+          make_double2(__ll2double_rn((int64_t)reg_acc[i]) /
+                           (double)std::numeric_limits<Torus>::max(),
+                       __ll2double_rn((int64_t)reg_acc[i + params::opt / 2]) /
+                           (double)std::numeric_limits<Torus>::max());
      tid += params::degree / params::opt;
    }
-    synchronize_threads_in_block();
+
    NSMFFT_direct<HalfDegree<params>>(fft);

    // lwe iteration
@@ -210,7 +210,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false, 1);
+        false);

    // Persist
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -225,12 +225,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  // bootstrapped ciphertext
  round_to_closest_multiple_inplace<Torus, params::opt,
                                    params::degree / params::opt>(
-      accumulator, base_log, level_count, 1);
+      accumulator, base_log, level_count);

  // Decompose the accumulator. Each block gets one level of the
  // decomposition, for the mask and the body (so block 0 will have the
  // accumulator decomposed at level 0, 1 at 1, etc.)
-  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
+  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
  gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

  // We are using the same memory space for accumulator_fft and
@@ -252,7 +252,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
        uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
        uint32_t lwe_chunk_size, int8_t *device_mem,
-        uint64_t device_memory_size_per_block) {
+        uint64_t device_memory_size_per_block, uint32_t lut_count,
+        uint32_t lut_stride) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
@@ -324,11 +325,39 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice, 1,
-                                         0);
+      sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             global_slice, glwe_dimension,
+                                             i * lut_stride);
+        }
+      }
    } else if (blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0,
-                                         0);
+      sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             global_slice, 0, i * lut_stride);
+        }
+      }
    }
  }
 }
@@ -467,10 +496,12 @@ __host__ void execute_compute_keybundle(
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, uint32_t lwe_offset) {

  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;

  uint32_t keybundle_size_per_input =
      lwe_chunk_size * level_count * (glwe_dimension + 1) *
@@ -508,14 +539,12 @@ __host__ void execute_compute_keybundle(
 }

 template <typename Torus, class params>
-__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
-                               Torus *lut_vector, Torus *lut_vector_indexes,
-                               Torus *lwe_array_in, Torus *lwe_input_indexes,
-                               pbs_buffer<Torus, MULTI_BIT> *buffer,
-                               uint32_t num_samples, uint32_t lwe_dimension,
-                               uint32_t glwe_dimension,
-                               uint32_t polynomial_size, uint32_t base_log,
-                               uint32_t level_count, int j, int lwe_offset) {
+__host__ void execute_step_one(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
+    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) {

  uint64_t full_sm_accumulate_step_one =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -564,14 +593,13 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
 }

 template <typename Torus, class params>
-__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
-                               Torus *lwe_array_out, Torus *lwe_output_indexes,
-                               pbs_buffer<Torus, MULTI_BIT> *buffer,
-                               uint32_t num_samples, uint32_t lwe_dimension,
-                               uint32_t glwe_dimension,
-                               uint32_t polynomial_size,
-                               int32_t grouping_factor, uint32_t level_count,
-                               int j, int lwe_offset, uint32_t lwe_chunk_size) {
+__host__ void execute_step_two(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
+    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
+    uint32_t j, uint32_t lwe_offset, uint32_t lwe_chunk_size,
+    uint32_t lut_count, uint32_t lut_stride) {

  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
@@ -594,7 +622,8 @@ __host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
            lwe_array_out, lwe_output_indexes, keybundle_fft,
            global_accumulator, global_accumulator_fft, lwe_dimension,
            glwe_dimension, polynomial_size, level_count, grouping_factor, j,
-            lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two);
+            lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two,
+            lut_count, lut_stride);
  else
    device_multi_bit_programmable_bootstrap_accumulate_step_two<Torus, params,
                                                                FULLSM>
@@ -602,7 +631,8 @@ __host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
           stream>>>(lwe_array_out, lwe_output_indexes, keybundle_fft,
                     global_accumulator, global_accumulator_fft, lwe_dimension,
                     glwe_dimension, polynomial_size, level_count,
-                     grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0);
+                     grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0,
+                     lut_count, lut_stride);
  check_cuda_error(cudaGetLastError());
 }

@@ -613,7 +643,8 @@ __host__ void host_multi_bit_programmable_bootstrap(
    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, num_samples, polynomial_size);
@@ -629,7 +660,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
    // Accumulate
    uint32_t chunk_size = std::min(
        lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-    for (int j = 0; j < chunk_size; j++) {
+    for (uint32_t j = 0; j < chunk_size; j++) {
      execute_step_one<Torus, params>(
          stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
          lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
@@ -638,7 +669,8 @@ __host__ void host_multi_bit_programmable_bootstrap(
      execute_step_two<Torus, params>(
          stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
          num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-          grouping_factor, level_count, j, lwe_offset, lwe_chunk_size);
+          grouping_factor, level_count, j, lwe_offset, lwe_chunk_size,
+          lut_count, lut_stride);
    }
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -44,7 +44,8 @@ __global__ void device_programmable_bootstrap_tbc(
    const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, bool support_dsm) {
+    uint64_t device_memory_size_per_block, bool support_dsm, uint32_t lut_count,
+    uint32_t lut_stride) {

  cluster_group cluster = this_cluster();

@@ -115,13 +116,13 @@ __global__ void device_programmable_bootstrap_tbc(
    // Perform ACC * (X^a_hat - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
-        accumulator, accumulator_rotated, a_hat, 1);
+        accumulator, accumulator_rotated, a_hat);

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator_rotated, base_log, level_count, 1);
+        accumulator_rotated, base_log, level_count);

    synchronize_threads_in_block();

@@ -154,9 +155,41 @@ __global__ void device_programmable_bootstrap_tbc(
    // Perform a sample extract. At this point, all blocks have the result, but
    // we do the computation at block 0 to avoid waiting for extra blocks, in
    // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
+    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+
+    if (lut_count > 1) {
+      for (int i = 1; i < lut_count; i++) {
+        auto next_lwe_array_out =
+            lwe_array_out +
+            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+        auto next_block_lwe_array_out =
+            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                    (glwe_dimension * polynomial_size + 1) +
+                                blockIdx.y * polynomial_size];
+
+        sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                           accumulator, glwe_dimension,
+                                           i * lut_stride);
+      }
+    }
  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
+    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+
+    if (lut_count > 1) {
+      for (int i = 1; i < lut_count; i++) {
+
+        auto next_lwe_array_out =
+            lwe_array_out +
+            (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+        auto next_block_lwe_array_out =
+            &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                    (glwe_dimension * polynomial_size + 1) +
+                                blockIdx.y * polynomial_size];
+
+        sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                           accumulator, 0, i * lut_stride);
+      }
+    }
  }
 }

@@ -225,7 +258,8 @@ __host__ void host_programmable_bootstrap_tbc(
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+    uint32_t lut_count, uint32_t lut_stride) {

  auto supports_dsm =
      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
@@ -281,7 +315,7 @@ __host__ void host_programmable_bootstrap_tbc(
        lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
        lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
        lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
-        supports_dsm));
+        supports_dsm, lut_count, lut_stride));
  } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
    config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;

@@ -290,7 +324,7 @@ __host__ void host_programmable_bootstrap_tbc(
        lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
        lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
        lwe_dimension, polynomial_size, base_log, level_count, d_mem,
-        partial_dm, supports_dsm));
+        partial_dm, supports_dsm, lut_count, lut_stride));
  } else {
    config.dynamicSmemBytes = full_sm + minimum_sm_tbc;

@@ -299,7 +333,7 @@ __host__ void host_programmable_bootstrap_tbc(
        lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
        lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
        lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
-        supports_dsm));
+        supports_dsm, lut_count, lut_stride));
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -31,7 +31,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
        int8_t *device_mem, uint64_t device_memory_size_per_block,
-        bool support_dsm) {
+        bool support_dsm, uint32_t lut_count, uint32_t lut_stride) {

  cluster_group cluster = this_cluster();

@@ -94,7 +94,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false, 1);
+        false);
  } else {
    // Load the accumulator calculated in previous iterations
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -106,13 +106,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator, base_log, level_count, 1);
+        accumulator, base_log, level_count);

    // Decompose the accumulator. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
-    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
-                                           1);
+    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

    // We are using the same memory space for accumulator_fft and
@@ -138,11 +137,40 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
-                                         0);
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_mask<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, glwe_dimension,
+                                             i * lut_stride);
+        }
+      }
    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
-                                         0);
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+      if (lut_count > 1) {
+        for (int i = 1; i < lut_count; i++) {
+
+          auto next_lwe_array_out =
+              lwe_array_out +
+              (i * gridDim.z * (glwe_dimension * polynomial_size + 1));
+          auto next_block_lwe_array_out =
+              &next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
+                                      (glwe_dimension * polynomial_size + 1) +
+                                  blockIdx.y * polynomial_size];
+
+          sample_extract_body<Torus, params>(next_block_lwe_array_out,
+                                             accumulator, 0, i * lut_stride);
+        }
+      }
    }
  } else {
    // Load the accumulator calculated in previous iterations
@@ -270,7 +298,8 @@ __host__ void execute_tbc_external_product_loop(
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, uint32_t lwe_offset, uint32_t lut_count,
+    uint32_t lut_stride) {

  auto supports_dsm =
      supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
@@ -297,6 +326,8 @@ __host__ void execute_tbc_external_product_loop(

  uint32_t chunk_size =
      std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;

  auto d_mem = buffer->d_mem_acc_tbc;
  auto keybundle_fft = buffer->keybundle_fft;
@@ -332,7 +363,8 @@ __host__ void execute_tbc_external_product_loop(
        lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
        global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
        base_log, level_count, grouping_factor, lwe_offset, chunk_size,
-        keybundle_size_per_input, d_mem, full_dm, supports_dsm));
+        keybundle_size_per_input, d_mem, full_dm, supports_dsm, lut_count,
+        lut_stride));
  } else if (max_shared_memory < full_dm + minimum_dm) {
    config.dynamicSmemBytes = partial_dm + minimum_dm;
    check_cuda_error(cudaLaunchKernelEx(
@@ -343,7 +375,8 @@ __host__ void execute_tbc_external_product_loop(
        lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
        global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
        base_log, level_count, grouping_factor, lwe_offset, chunk_size,
-        keybundle_size_per_input, d_mem, partial_dm, supports_dsm));
+        keybundle_size_per_input, d_mem, partial_dm, supports_dsm, lut_count,
+        lut_stride));
  } else {
    config.dynamicSmemBytes = full_dm + minimum_dm;
    check_cuda_error(cudaLaunchKernelEx(
@@ -354,7 +387,8 @@ __host__ void execute_tbc_external_product_loop(
        lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
        global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
        base_log, level_count, grouping_factor, lwe_offset, chunk_size,
-        keybundle_size_per_input, d_mem, 0, supports_dsm));
+        keybundle_size_per_input, d_mem, 0, supports_dsm, lut_count,
+        lut_stride));
  }
 }

@@ -365,7 +399,8 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
    Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t lut_count, uint32_t lut_stride) {
  cudaSetDevice(gpu_index);

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
@@ -385,7 +420,8 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
        num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset,
+        lut_count, lut_stride);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -31,6 +31,13 @@ __device__ void copy_polynomial(const T *__restrict__ source, T *dst) {
    tid = tid + block_size;
  }
 }
+template <typename T, int elems_per_thread, int block_size>
+__device__ void copy_polynomial_in_regs(const T *__restrict__ source, T *dst) {
+#pragma unroll
+  for (int i = 0; i < elems_per_thread; i++) { // fills dst[0..elems_per_thread)
+    dst[i] = source[threadIdx.x + i * block_size]; // strided by block_size
+  }
+}

 /*
 * Receives num_poly  concatenated polynomials of type T. For each:
@@ -45,7 +52,7 @@ template <typename T, int elems_per_thread, int block_size>
 __device__ void
 divide_by_monomial_negacyclic_inplace(T *accumulator,
                                      const T *__restrict__ input, uint32_t j,
-                                      bool zeroAcc, uint32_t num_poly) {
+                                      bool zeroAcc, uint32_t num_poly = 1) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
@@ -94,7 +101,7 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
 */
 template <typename T, int elems_per_thread, int block_size>
 __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
-    T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
+    T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
@@ -133,7 +140,7 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
 template <typename T, int elems_per_thread, int block_size>
 __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
                                                  int level_count,
-                                                  uint32_t num_poly) {
+                                                  uint32_t num_poly = 1) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
@@ -192,7 +199,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
 // Extracts the body of the nth-LWE in a GLWE.
 template <typename Torus, class params>
 __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
-                                    uint32_t glwe_dimension, uint32_t nth) {
+                                    uint32_t glwe_dimension, uint32_t nth = 0) {
  // Set first coefficient of the glwe as the body of the LWE sample
  lwe_array_out[glwe_dimension * params::degree] =
      glwe[glwe_dimension * params::degree + nth];
@@ -201,7 +208,8 @@ __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
 // Extracts the mask from the nth-LWE in a GLWE.
 template <typename Torus, class params>
 __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
-                                    uint32_t glwe_dimension, uint32_t nth) {
+                                    uint32_t glwe_dimension = 1,
+                                    uint32_t nth = 0) {
  for (int z = 0; z < glwe_dimension; z++) {
    Torus *lwe_array_out_slice =
        (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh
@@ -83,4 +83,29 @@ __device__ void polynomial_accumulate_monic_monomial_mul(
  }
 }

+template <typename T, class params>
+__device__ void polynomial_product_accumulate_by_monomial_nosync(
+    T *result, const T *__restrict__ poly, uint32_t monomial_degree) {
+  // monomial_degree \in [0, 2 * params::degree)
+  int full_cycles_count = monomial_degree / params::degree;
+  int remainder_degrees = monomial_degree % params::degree;
+
+// Each thread handles params::opt fixed output slots (stride degree / opt);
+// the source index is computed per slot instead of "chasing" the position
+#pragma unroll
+  for (int i = 0; i < params::opt; i++) {
+    int pos =
+        (threadIdx.x + i * (params::degree / params::opt) - monomial_degree) &
+        (params::degree - 1); // wraps the index; assumes power-of-two degree
+
+    T element = poly[pos];
+    T x = SEL(element, -element, full_cycles_count % 2); // parity-based sign
+    x = SEL(-x, x,
+            threadIdx.x + i * (params::degree / params::opt) >=
+                remainder_degrees);
+
+    result[i] += x; // accumulate only; this function performs no sync
+  }
+}
+
 #endif // CNCRT_POLYNOMIAL_MATH_H
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
@@ -6,7 +6,7 @@
 std::mutex m;
 bool p2p_enabled = false;

-int cuda_setup_multi_gpu() {
+int32_t cuda_setup_multi_gpu() {
  int num_gpus = cuda_get_number_of_gpus();
  if (num_gpus == 0)
    PANIC("GPU error: the number of GPUs should be > 0.")
@@ -32,7 +32,7 @@ int cuda_setup_multi_gpu() {
    }
    m.unlock();
  }
-  return num_used_gpus;
+  return (int32_t)(num_used_gpus);
 }

 int get_active_gpu_count(int num_inputs, int gpu_count) {
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
@@ -176,22 +176,24 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
  }

  scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
-      stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
-      glwe_dimension, polynomial_size, pbs_level, grouping_factor,
-      input_lwe_ciphertext_count, true);
-
+      stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
+      lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
+      grouping_factor, input_lwe_ciphertext_count, true);
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  for (auto _ : st) {
    // Execute PBS
    cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
-        stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
-        d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
-        (pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
-        pbs_level, input_lwe_ciphertext_count);
-    cuda_synchronize_stream(stream);
+        stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
+        d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
+        d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
+        lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
+        pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count,
+        lut_stride);
+    cuda_synchronize_stream(stream, gpu_index);
  }

-  cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
+  cleanup_cuda_multi_bit_programmable_bootstrap(stream, gpu_index, &buffer);
 }
 #endif

@@ -208,7 +210,8 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
      stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
      glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
      true);
-
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  for (auto _ : st) {
    // Execute PBS
    cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -216,7 +219,8 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
        d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
        d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
        lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-        pbs_base_log, pbs_level, input_lwe_ciphertext_count);
+        pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count,
+        lut_stride);
    cuda_synchronize_stream(stream, gpu_index);
  }

@@ -229,7 +233,8 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
      stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
      lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
      grouping_factor, input_lwe_ciphertext_count, true);
-
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  for (auto _ : st) {
    // Execute PBS
    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -237,7 +242,8 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
        d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
        d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
        lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-        pbs_base_log, pbs_level, input_lwe_ciphertext_count);
+        pbs_base_log, pbs_level, input_lwe_ciphertext_count, lut_count,
+        lut_stride);
    cuda_synchronize_stream(stream, gpu_index);
  }

@@ -255,23 +261,25 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
  }

  scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
-      stream, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer, glwe_dimension,
-      polynomial_size, pbs_level, input_lwe_ciphertext_count, true);
-
+      stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
+      glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
+      true);
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  for (auto _ : st) {
    // Execute PBS
    cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
-        stream, (uint64_t *)d_lwe_ct_out_array,
+        stream, gpu_index, (uint64_t *)d_lwe_ct_out_array,
        (uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
        (uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
        (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
        glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count);
-    cuda_synchronize_stream(stream);
+        input_lwe_ciphertext_count, lut_count, lut_stride);
+    cuda_synchronize_stream(stream, gpu_index);
  }

-  cleanup_cuda_programmable_bootstrap(stream, &buffer);
+  cleanup_cuda_programmable_bootstrap(stream, gpu_index, &buffer);
 }
 #endif

@@ -288,7 +296,8 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
      stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
      glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
      true);
-
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  for (auto _ : st) {
    // Execute PBS
    cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
@@ -298,7 +307,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
        (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
        glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count);
+        input_lwe_ciphertext_count, lut_count, lut_stride);
    cuda_synchronize_stream(stream, gpu_index);
  }

@@ -312,7 +321,8 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
      stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
      glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
      true);
-
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  for (auto _ : st) {
    // Execute PBS
    cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -322,7 +332,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
        (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
        glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count);
+        input_lwe_ciphertext_count, lut_count, lut_stride);
    cuda_synchronize_stream(stream, gpu_index);
  }

--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -173,6 +173,8 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                 polynomial_size * (lwe_dimension + 1);
+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  // Here execute the PBS
  for (int r = 0; r < repetitions; r++) {
    double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
@@ -190,7 +192,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
          (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
          (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
          lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-          pbs_level, number_of_inputs);
+          pbs_level, number_of_inputs, lut_count, lut_stride);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               (glwe_dimension * polynomial_size + 1) *
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
@@ -119,6 +119,8 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
                 (glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size *
                 (1 << grouping_factor);

+  uint32_t lut_count = 1;
+  uint32_t lut_stride = 0;
  for (int r = 0; r < repetitions; r++) {
    uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
    uint64_t *lwe_sk_out =
@@ -135,7 +137,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
          (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
          (void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension,
          glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
-          pbs_level, number_of_inputs);
+          pbs_level, number_of_inputs, lut_count, lut_stride);

      // Copy result to the host memory
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -58,54 +58,4 @@ flavor_name = "n3-A100x8-NVLink"
 [backend.hyperstack.multi-gpu-test]
 environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
-flavor_name = "n3-A100x4"
-
-[command.signed_integer_full_bench]
-workflow = "signed_integer_full_benchmark.yml"
-profile = "bench"
-check_run_name = "Signed Integer CPU AWS Benchmarks Full Suite"
-
-[command.integer_full_bench]
-workflow = "integer_full_benchmark.yml"
-profile = "bench"
-check_run_name = "Integer CPU AWS Benchmarks Full Suite"
-
-[command.integer_bench]
-workflow = "integer_benchmark.yml"
-profile = "bench"
-check_run_name = "Integer CPU AWS Benchmarks"
-
-[command.integer_multi_bit_bench]
-workflow = "integer_multi_bit_benchmark.yml"
-profile = "bench"
-check_run_name = "Integer multi bit CPU AWS Benchmarks"
-
-[command.signed_integer_bench]
-workflow = "signed_integer_benchmark.yml"
-profile = "bench"
-check_run_name = "Signed integer CPU AWS Benchmarks"
-
-[command.signed_integer_multi_bit_bench]
-workflow = "signed_integer_multi_bit_benchmark.yml"
-profile = "bench"
-check_run_name = "Signed integer multi bit CPU AWS Benchmarks"
-
-[command.shortint_full_bench]
-workflow = "shortint_full_benchmark.yml"
-profile = "bench"
-check_run_name = "Shortint CPU AWS Benchmarks Full Suite"
-
-[command.shortint_bench]
-workflow = "shortint_benchmark.yml"
-profile = "bench"
-check_run_name = "Shortint CPU AWS Benchmarks"
-
-[command.boolean_bench]
-workflow = "boolean_benchmark.yml"
-profile = "bench"
-check_run_name = "Boolean CPU AWS Benchmarks"
-
-[command.core_crypto_bench]
-workflow = "core_crypto_benchmark.yml"
-profile = "bench"
-check_run_name = "Core crypto CPU AWS Benchmarks"
+flavor_name = "n3-RTX-A6000x4"
--- a/scripts/integer-tests.sh
+++ b/scripts/integer-tests.sh
@@ -130,11 +130,11 @@ fi
 # Override test-threads number to avoid Out-of-memory issues on GPU instances
 if [[ "${backend}" == "gpu" ]]; then
    if [[ "${BIG_TESTS_INSTANCE}" == TRUE ]]; then
-        test_threads=8
-        doctest_threads=8
+        test_threads=1
+        doctest_threads=1
    else
-        test_threads=3
-        doctest_threads=3
+        test_threads=1
+        doctest_threads=1
    fi
 fi

--- a/tfhe-zk-pok/Cargo.toml
+++ b/tfhe-zk-pok/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-zk-pok"
-version = "0.3.0-alpha.0"
+version = "0.3.0-alpha.1"
 edition = "2021"
 keywords = ["zero", "knowledge", "proof", "vector-commitments"]
 homepage = "https://zama.ai/"
@@ -15,13 +15,17 @@ description = "tfhe-zk-pok: An implementation of zero-knowledge proofs of encryp
 ark-bls12-381 = { package = "tfhe-ark-bls12-381", version = "0.4.0" }
 ark-ec = { package = "tfhe-ark-ec", version = "0.4.2", features = ["parallel"] }
 ark-ff = { package = "tfhe-ark-ff", version = "0.4.3", features = ["parallel"] }
-ark-poly = { package = "tfhe-ark-poly", version = "0.4.2", features = ["parallel"] }
+ark-poly = { package = "tfhe-ark-poly", version = "0.4.2", features = [
+    "parallel",
+] }
 ark-serialize = { version = "0.4.2" }
 rand = "0.8.5"
 rayon = "1.8.0"
 sha3 = "0.10.8"
 serde = { version = "~1.0", features = ["derive"] }
 zeroize = "1.7.0"
+num-bigint = "0.4.5"

 [dev-dependencies]
 serde_json = "~1.0"
+itertools = "0.11.0"
--- a/tfhe-zk-pok/src/curve_api.rs
+++ b/tfhe-zk-pok/src/curve_api.rs
@@ -210,9 +210,14 @@ impl CurveGroupOps<bls12_381::Zp> for bls12_381::G1 {
    }

    fn mul_scalar(self, scalar: bls12_381::Zp) -> Self {
-        self.mul_scalar(scalar)
+        if scalar.inner == MontFp!("2") {
+            self.double()
+        } else {
+            self.mul_scalar(scalar)
+        }
    }

+    #[track_caller]
    fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_381::Zp]) -> Self {
        Self::Affine::multi_mul_scalar(bases, scalars)
    }
@@ -245,9 +250,14 @@ impl CurveGroupOps<bls12_381::Zp> for bls12_381::G2 {
    }

    fn mul_scalar(self, scalar: bls12_381::Zp) -> Self {
-        self.mul_scalar(scalar)
+        if scalar.inner == MontFp!("2") {
+            self.double()
+        } else {
+            self.mul_scalar(scalar)
+        }
    }

+    #[track_caller]
    fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_381::Zp]) -> Self {
        Self::Affine::multi_mul_scalar(bases, scalars)
    }
@@ -273,6 +283,9 @@ impl PairingGroupOps<bls12_381::Zp, bls12_381::G1, bls12_381::G2> for bls12_381:
    }

    fn pairing(x: bls12_381::G1, y: bls12_381::G2) -> Self {
+        if x == bls12_381::G1::ZERO || y == bls12_381::G2::ZERO {
+            return Self::pairing(bls12_381::G1::ZERO, bls12_381::G2::GENERATOR);
+        }
        Self::pairing(x, y)
    }
 }
@@ -329,12 +342,21 @@ impl CurveGroupOps<bls12_446::Zp> for bls12_446::G1 {
    }

    fn mul_scalar(self, scalar: bls12_446::Zp) -> Self {
-        self.mul_scalar(scalar)
+        if scalar.inner == MontFp!("2") {
+            self.double()
+        } else {
+            self.mul_scalar(scalar)
+        }
    }

+    #[track_caller]
    fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_446::Zp]) -> Self {
-        msm::msm_wnaf_g1_446(bases, scalars)
-        // Self::Affine::multi_mul_scalar(bases, scalars)
+        // overhead seems to not be worth it outside of wasm
+        if cfg!(target_family = "wasm") {
+            msm::msm_wnaf_g1_446(bases, scalars)
+        } else {
+            Self::Affine::multi_mul_scalar(bases, scalars)
+        }
    }

    fn to_bytes(self) -> impl AsRef<[u8]> {
@@ -365,9 +387,14 @@ impl CurveGroupOps<bls12_446::Zp> for bls12_446::G2 {
    }

    fn mul_scalar(self, scalar: bls12_446::Zp) -> Self {
-        self.mul_scalar(scalar)
+        if scalar.inner == MontFp!("2") {
+            self.double()
+        } else {
+            self.mul_scalar(scalar)
+        }
    }

+    #[track_caller]
    fn multi_mul_scalar(bases: &[Self::Affine], scalars: &[bls12_446::Zp]) -> Self {
        Self::Affine::multi_mul_scalar(bases, scalars)
    }
@@ -393,13 +420,16 @@ impl PairingGroupOps<bls12_446::Zp, bls12_446::G1, bls12_446::G2> for bls12_446:
    }

    fn pairing(x: bls12_446::G1, y: bls12_446::G2) -> Self {
+        if x == bls12_446::G1::ZERO || y == bls12_446::G2::ZERO {
+            return Self::pairing(bls12_446::G1::ZERO, bls12_446::G2::GENERATOR);
+        }
        Self::pairing(x, y)
    }
 }

-#[derive(Copy, Clone, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize)]
 pub struct Bls12_381;
-#[derive(Copy, Clone, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize)]
 pub struct Bls12_446;

 impl Curve for Bls12_381 {
--- a/tfhe-zk-pok/src/curve_api/bls12_446.rs
+++ b/tfhe-zk-pok/src/curve_api/bls12_446.rs
@@ -55,6 +55,7 @@ mod g1 {
    }

    impl G1Affine {
+        #[track_caller]
        pub fn multi_mul_scalar(bases: &[Self], scalars: &[Zp]) -> G1 {
            // SAFETY: interpreting a `repr(transparent)` pointer as its contents.
            G1 {
@@ -124,6 +125,7 @@ mod g1 {
            }
        }

+        #[track_caller]
        pub fn multi_mul_scalar(bases: &[Self], scalars: &[Zp]) -> Self {
            use rayon::prelude::*;
            let bases = bases
@@ -230,6 +232,7 @@ mod g2 {
    }

    impl G2Affine {
+        #[track_caller]
        pub fn multi_mul_scalar(bases: &[Self], scalars: &[Zp]) -> G2 {
            // SAFETY: interpreting a `repr(transparent)` pointer as its contents.
            G2 {
@@ -247,10 +250,10 @@ mod g2 {
        // functions. we cache it since it requires a Zp division
        // https://hackmd.io/@tazAymRSQCGXTUKkbh1BAg/Sk27liTW9#Math-Formula-for-Point-Addition
        pub(crate) fn compute_m(self, other: G2Affine) -> Option<crate::curve_446::Fq2> {
-            let zero = crate::curve_446::Fq2::ZERO;
-
            // in the context of elliptic curves, the point at infinity is the zero element of the
            // group
+            let zero = crate::curve_446::Fq2::ZERO;
+
            if self.inner.infinity || other.inner.infinity {
                return None;
            }
--- a/tfhe-zk-pok/src/curve_api/msm.rs
+++ b/tfhe-zk-pok/src/curve_api/msm.rs
@@ -1,6 +1,6 @@
 use ark_ec::short_weierstrass::Affine;
 use ark_ec::AffineRepr;
-use ark_ff::{AdditiveGroup, BigInt, BigInteger, Field, Fp, PrimeField};
+use ark_ff::{AdditiveGroup, BigInteger, Field, Fp, PrimeField};
 use rayon::prelude::*;

 fn make_digits(a: &impl BigInteger, w: usize, num_bits: usize) -> impl Iterator<Item = i64> + '_ {
@@ -46,6 +46,7 @@ fn make_digits(a: &impl BigInteger, w: usize, num_bits: usize) -> impl Iterator<
 }

 // Compute msm using windowed non-adjacent form
+#[track_caller]
 pub fn msm_wnaf_g1_446(
    bases: &[super::bls12_446::G1Affine],
    scalars: &[super::bls12_446::Zp],
@@ -236,207 +237,3 @@ pub fn msm_wnaf_g1_446(
                total
            })
 }
-
-// Compute msm using windowed non-adjacent form
-pub fn msm_wnaf_g1_446_extended(
-    bases: &[super::bls12_446::G1Affine],
-    scalars: &[super::bls12_446::Zp],
-) -> super::bls12_446::G1 {
-    use super::bls12_446::*;
-    type BaseField = Fp<ark_ff::MontBackend<crate::curve_446::FqConfig, 7>, 7>;
-
-    // let num_bits = 75usize;
-    // let mask = BigInt([!0, (1 << 11) - 1, 0, 0, 0]);
-    // let scalars = &*scalars
-    //     .par_iter()
-    //     .map(|x| x.inner.into_bigint())
-    //     .flat_map_iter(|x| (0..4).map(move |i| (x >> (75 * i)) & mask))
-    //     .collect::<Vec<_>>();
-
-    let num_bits = 150usize;
-    let mask = BigInt([!0, !0, (1 << 22) - 1, 0, 0]);
-    let scalars = &*scalars
-        .par_iter()
-        .map(|x| x.inner.into_bigint())
-        .flat_map_iter(|x| (0..2).map(move |i| (x >> (150 * i)) & mask))
-        .collect::<Vec<_>>();
-
-    assert_eq!(bases.len(), scalars.len());
-
-    let size = bases.len();
-
-    let c = if size < 32 {
-        3
-    } else {
-        // natural log approx
-        (size.ilog2() as usize * 69 / 100) + 2
-    };
-    let c = c - 3;
-
-    let digits_count = (num_bits + c - 1) / c;
-    let scalar_digits = scalars
-        .into_par_iter()
-        .flat_map_iter(|s| make_digits(s, c, num_bits))
-        .collect::<Vec<_>>();
-
-    let zero = G1Affine {
-        inner: Affine::zero(),
-    };
-
-    let window_sums: Vec<_> = (0..digits_count)
-        .into_par_iter()
-        .map(|i| {
-            let n = 1 << c;
-            let mut indices = vec![vec![]; n];
-            let mut d = vec![BaseField::ZERO; n + 1];
-            let mut e = vec![BaseField::ZERO; n + 1];
-
-            for (idx, digits) in scalar_digits.chunks(digits_count).enumerate() {
-                use core::cmp::Ordering;
-                // digits is the digits thing of the first scalar?
-                let scalar = digits[i];
-                match 0.cmp(&scalar) {
-                    Ordering::Less => indices[(scalar - 1) as usize].push(idx),
-                    Ordering::Greater => indices[(-scalar - 1) as usize].push(!idx),
-                    Ordering::Equal => (),
-                }
-            }
-
-            let mut buckets = vec![zero; 1 << c];
-
-            loop {
-                d[0] = BaseField::ONE;
-                for (k, (bucket, idx)) in core::iter::zip(&mut buckets, &mut indices).enumerate() {
-                    if let Some(idx) = idx.last().copied() {
-                        let value = if idx >> (usize::BITS - 1) == 1 {
-                            let mut val = bases[!idx];
-                            val.inner.y = -val.inner.y;
-                            val
-                        } else {
-                            bases[idx]
-                        };
-
-                        if !bucket.inner.infinity {
-                            let a = value.inner.x - bucket.inner.x;
-                            if a != BaseField::ZERO {
-                                d[k + 1] = d[k] * a;
-                            } else if value.inner.y == bucket.inner.y {
-                                d[k + 1] = d[k] * value.inner.y.double();
-                            } else {
-                                d[k + 1] = d[k];
-                            }
-                            continue;
-                        }
-                    }
-                    d[k + 1] = d[k];
-                }
-                e[n] = d[n].inverse().unwrap();
-
-                for (k, (bucket, idx)) in core::iter::zip(&mut buckets, &mut indices)
-                    .enumerate()
-                    .rev()
-                {
-                    if let Some(idx) = idx.last().copied() {
-                        let value = if idx >> (usize::BITS - 1) == 1 {
-                            let mut val = bases[!idx];
-                            val.inner.y = -val.inner.y;
-                            val
-                        } else {
-                            bases[idx]
-                        };
-
-                        if !bucket.inner.infinity {
-                            let a = value.inner.x - bucket.inner.x;
-                            if a != BaseField::ZERO {
-                                e[k] = e[k + 1] * a;
-                            } else if value.inner.y == bucket.inner.y {
-                                e[k] = e[k + 1] * value.inner.y.double();
-                            } else {
-                                e[k] = e[k + 1];
-                            }
-                            continue;
-                        }
-                    }
-                    e[k] = e[k + 1];
-                }
-
-                let d = &d[..n];
-                let e = &e[1..];
-
-                let mut empty = true;
-                for ((&d, &e), (bucket, idx)) in core::iter::zip(
-                    core::iter::zip(d, e),
-                    core::iter::zip(&mut buckets, &mut indices),
-                ) {
-                    empty &= idx.len() <= 1;
-                    if let Some(idx) = idx.pop() {
-                        let value = if idx >> (usize::BITS - 1) == 1 {
-                            let mut val = bases[!idx];
-                            val.inner.y = -val.inner.y;
-                            val
-                        } else {
-                            bases[idx]
-                        };
-
-                        if !bucket.inner.infinity {
-                            let x1 = bucket.inner.x;
-                            let x2 = value.inner.x;
-                            let y1 = bucket.inner.y;
-                            let y2 = value.inner.y;
-
-                            let eq_x = x1 == x2;
-
-                            if eq_x && y1 != y2 {
-                                bucket.inner.infinity = true;
-                            } else {
-                                let r = d * e;
-                                let m = if eq_x {
-                                    let x1 = x1.square();
-                                    x1 + x1.double()
-                                } else {
-                                    y2 - y1
-                                };
-                                let m = m * r;
-
-                                let x3 = m.square() - x1 - x2;
-                                let y3 = m * (x1 - x3) - y1;
-                                bucket.inner.x = x3;
-                                bucket.inner.y = y3;
-                            }
-                        } else {
-                            *bucket = value;
-                        }
-                    }
-                }
-
-                if empty {
-                    break;
-                }
-            }
-
-            let mut running_sum = G1::ZERO;
-            let mut res = G1::ZERO;
-            buckets.into_iter().rev().for_each(|b| {
-                running_sum.inner += b.inner;
-                res += running_sum;
-            });
-            res
-        })
-        .collect();
-
-    // We store the sum for the lowest window.
-    let lowest = *window_sums.first().unwrap();
-
-    // We're traversing windows from high to low.
-    lowest
-        + window_sums[1..]
-            .iter()
-            .rev()
-            .fold(G1::ZERO, |mut total, &sum_i| {
-                total += sum_i;
-                for _ in 0..c {
-                    total = total.double();
-                }
-                total
-            })
-}
--- a/tfhe-zk-pok/src/four_squares.rs
+++ b/tfhe-zk-pok/src/four_squares.rs
@@ -0,0 +1,308 @@
+use ark_ff::biginteger::arithmetic::widening_mul;
+use rand::prelude::*;
+
+/// Returns `x` multiplied by itself.
+pub fn sqr<T: Copy + core::ops::Mul>(x: T) -> T::Output {
+    let value = x;
+    value * value
+}
+
+/// Integer square root: the largest `r` such that `r * r <= this`.
+///
+/// Mirrors the standard library implementation, which is copied here because
+/// `isqrt` is unstable at the moment.
+pub fn isqrt(this: u128) -> u128 {
+    // 0 and 1 are their own square roots.
+    if this < 2 {
+        return this;
+    }
+
+    // Binary digit-by-digit method, as presented in
+    // <https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Binary_numeral_system_(base_2)>
+    // which cites as source the following C code:
+    // <https://web.archive.org/web/20120306040058/http://medialab.freaknet.org/martin/src/sqrt/sqrt.c>.
+
+    let mut remainder = this;
+    let mut root: u128 = 0;
+    // Largest power of four that does not exceed `this`.
+    let mut bit: u128 = 1 << (this.ilog2() & !1);
+
+    while bit != 0 {
+        let candidate = root + bit;
+        root >>= 1;
+        if remainder >= candidate {
+            // The current bit belongs to the root.
+            remainder -= candidate;
+            root += bit;
+        }
+        bit >>= 2;
+    }
+
+    root
+}
+
+/// Runs the Euclidean remainder sequence starting from `(p, s)` and returns the
+/// first remainder that is not greater than `isqrt(p)`.
+///
+/// If `s` itself is already at or below that bound, it is returned unchanged.
+fn half_gcd(p: u128, s: u128) -> u128 {
+    let bound = isqrt(p);
+    let (mut previous, mut current) = (p, s);
+    while current > bound {
+        (previous, current) = (current, previous % current);
+    }
+    current
+}
+
+/// Computes the multiplicative inverse of `p` modulo `2^64` with the extended
+/// Euclidean algorithm.
+///
+/// # Panics
+///
+/// Panics if `p` is even, or if the computed value fails the final
+/// `inverse * p == 1 (mod 2^64)` check.
+fn modular_inv_2_64(p: u64) -> u64 {
+    assert_eq!(p % 2, 1);
+
+    // Remainders need 65 bits because the sequence starts from 2^64.
+    let (mut prev_rem, mut rem) = (p as u128, 1u128 << 64);
+    // Coefficients of `p`, tracked modulo 2^64 through wrapping arithmetic.
+    let (mut prev_coef, mut coef) = (1u64, 0u64);
+
+    while rem != 0 {
+        let quotient = prev_rem / rem;
+
+        let next_rem = prev_rem - quotient * rem;
+        prev_rem = rem;
+        rem = next_rem;
+
+        // Truncating the quotient is fine: the coefficient only matters modulo 2^64.
+        let quotient = quotient as u64;
+        let next_coef = prev_coef.wrapping_sub(quotient.wrapping_mul(coef));
+        prev_coef = coef;
+        coef = next_coef;
+    }
+
+    assert_eq!(u64::wrapping_mul(prev_coef, p), 1);
+    prev_coef
+}
+
+/// Parameters for Montgomery arithmetic modulo `p`, with `2^128` as the radix
+/// (see `Montgomery::new` for how each field is derived).
+#[derive(Copy, Clone, Debug)]
+struct Montgomery {
+    /// The modulus; `Montgomery::new` asserts that it is non-zero and odd.
+    p: u128,
+    /// `(2^128)^2 mod p`; multiplying by it converts a value into Montgomery form.
+    r2: u128,
+    /// Negated inverse of the low 64 bits of `p` modulo `2^64`, consumed by `redc`.
+    p_prime: u64,
+}
+
+impl Montgomery {
+    /// Builds the Montgomery context for the modulus `p`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `p` is zero or even.
+    fn new(p: u128) -> Self {
+        assert_ne!(p, 0);
+        assert_eq!(p % 2, 1);
+
+        // r = 2^128
+        // we want to compute r^2 mod p
+        // `2^128 - p` is congruent to `2^128` modulo `p` and fits in a u128.
+        let r = p.wrapping_neg() % p;
+
+        // The square of `r` can exceed 128 bits, so it is computed with big integers.
+        let r = num_bigint::BigUint::from(r);
+        let r2 = &r * &r;
+        let r2 = r2 % p;
+        let r2_digits = &*r2.to_u64_digits();
+
+        // Reassemble the (at most two) little-endian 64-bit digits into a u128.
+        let r2 = match *r2_digits {
+            [] => 0u128,
+            [a] => a as u128,
+            [a, b] => a as u128 | ((b as u128) << 64),
+            _ => unreachable!("value modulo 128 bit integer should have at most two u64 digits"),
+        };
+
+        // Only the low 64 bits of `p` matter for an inverse modulo 2^64.
+        let p_prime = modular_inv_2_64(p as u64).wrapping_neg();
+
+        Self { p, r2, p_prime }
+    }
+
+    /// Montgomery reduction of the 256-bit value `hi * 2^128 + lo`.
+    ///
+    /// Performs two 64-bit reduction rounds (each one clears the lowest remaining
+    /// limb by adding a multiple of `p`), keeps the upper limbs, and finishes with
+    /// a single conditional subtraction of `p`.
+    fn redc(self, lo: u128, hi: u128) -> u128 {
+        let p0 = self.p as u64;
+        let p1 = (self.p >> 64) as u64;
+
+        // Limbs of the input, least significant first; `t4` collects carries.
+        let t0 = lo as u64;
+        let mut t1 = (lo >> 64) as u64;
+        let mut t2 = hi as u64;
+        let mut t3 = (hi >> 64) as u64;
+        let mut t4 = 0u64;
+
+        // First round: add `m * p` so that limb 0 becomes zero.
+        {
+            let m = u64::wrapping_mul(t0, self.p_prime);
+            let mut c = 0u64;
+
+            let x = c as u128 + t0 as u128 + widening_mul(m, p0);
+            // t0 = x as u64;
+            c = (x >> 64) as u64;
+
+            let x = c as u128 + t1 as u128 + widening_mul(m, p1);
+            t1 = x as u64;
+            c = (x >> 64) as u64;
+
+            let x = c as u128 + t2 as u128;
+            t2 = x as u64;
+            c = (x >> 64) as u64;
+
+            let x = c as u128 + t3 as u128;
+            t3 = x as u64;
+            c = (x >> 64) as u64;
+
+            t4 += c;
+        }
+
+        // Second round: same operation, one limb higher, so that limb 1 becomes zero.
+        {
+            let m = u64::wrapping_mul(t1, self.p_prime);
+            let mut c = 0u64;
+
+            let x = c as u128 + t1 as u128 + widening_mul(m, p0);
+            // t1 = x as u64;
+            c = (x >> 64) as u64;
+
+            let x = c as u128 + t2 as u128 + widening_mul(m, p1);
+            t2 = x as u64;
+            c = (x >> 64) as u64;
+
+            let x = c as u128 + t3 as u128;
+            t3 = x as u64;
+            c = (x >> 64) as u64;
+
+            t4 += c;
+        }
+
+        // The two cleared limbs are dropped, which divides by 2^128.
+        let mut s0 = t2;
+        let mut s1 = t3;
+        let s2 = t4;
+
+        // Subtract `p` once unless the result is already below it.
+        if !(s2 == 0 && (s1, s0) < (p1, p0)) {
+            let borrow;
+            (s0, borrow) = u64::overflowing_sub(s0, p0);
+            s1 = s1.wrapping_sub(p1).wrapping_sub(borrow as u64);
+        }
+
+        s0 as u128 | ((s1 as u128) << 64)
+    }
+
+    /// Converts the natural value `x` into Montgomery form by multiplying it with `r2`.
+    fn mont_from_natural(self, x: u128) -> u128 {
+        self.mul(x, self.r2)
+    }
+
+    /// Converts the Montgomery-form value `x` back to its natural representation.
+    fn natural_from_mont(self, x: u128) -> u128 {
+        self.redc(x, 0)
+    }
+
+    /// Montgomery multiplication: forms the full 256-bit product of `x` and `y`
+    /// and reduces it with `redc`.
+    fn mul(self, x: u128, y: u128) -> u128 {
+        let x0 = x as u64;
+        let x1 = (x >> 64) as u64;
+        let y0 = y as u64;
+        let y1 = (y >> 64) as u64;
+
+        // Schoolbook multiplication on 64-bit halves.
+        let lolo = widening_mul(x0, y0);
+        let lohi = widening_mul(x0, y1);
+        let hilo = widening_mul(x1, y0);
+        let hihi = widening_mul(x1, y1);
+
+        // Low 128 bits, remembering the carries out of each addition.
+        let lo = lolo;
+        let (lo, o0) = u128::overflowing_add(lo, lohi << 64);
+        let (lo, o1) = u128::overflowing_add(lo, hilo << 64);
+
+        // High 128 bits: upper halves of the cross terms plus the carries.
+        let hi = hihi + (lohi >> 64) + (hilo >> 64) + (o0 as u128 + o1 as u128);
+
+        self.redc(lo, hi)
+    }
+
+    /// Raises `x` (given in Montgomery form) to the power `n` by square-and-multiply.
+    ///
+    /// The result is in Montgomery form, including for `n == 0` where the
+    /// Montgomery form of one is returned.
+    fn exp(self, x: u128, n: u128) -> u128 {
+        let one = self.mont_from_natural(1);
+        if n == 0 {
+            // x^0 is the multiplicative identity; it has to be in Montgomery form
+            // like every other value returned by this function.
+            return one;
+        }
+        let mut y = one;
+        let mut x = x;
+        let mut n = n;
+        // Process every bit of `n` except the top one, least significant first.
+        while n > 1 {
+            if n % 2 == 1 {
+                y = self.mul(x, y);
+            }
+            x = self.mul(x, x);
+            n /= 2;
+        }
+        // The top bit of `n` is always set.
+        self.mul(x, y)
+    }
+}
+
+/// Values `v` with `v % 4 == 2` below this bound are decomposed by exhaustive
+/// search. The randomized search cannot handle all of them: it panics for 2 and 6
+/// (`isqrt(v) - 2` underflows or is a zero modulus) and never terminates for
+/// e.g. 10, 14, 22 and 34, where no candidate pair leaves an accepted remainder.
+const EXHAUSTIVE_SEARCH_BOUND: u128 = 64;
+
+/// Finds `[a, b, c, d]` with `a <= b <= c` and `a^2 + b^2 + c^2 + d^2 == v` by
+/// trying candidates in increasing order. Only intended for small `v`.
+fn four_squares_exhaustive(v: u128) -> [u64; 4] {
+    for a in 0..=isqrt(v) {
+        let rem_a = v - sqr(a);
+        for b in a..=isqrt(rem_a) {
+            let rem_b = rem_a - sqr(b);
+            for c in b..=isqrt(rem_b) {
+                let rem_c = rem_b - sqr(c);
+                let d = isqrt(rem_c);
+                if sqr(d) == rem_c {
+                    return [a as u64, b as u64, c as u64, d as u64];
+                }
+            }
+        }
+    }
+    unreachable!("every non-negative integer is a sum of four squares")
+}
+
+/// Returns four integers whose squares sum to `v`.
+///
+/// The work is done for `v % 4 == 2`; the other residues are reduced to that case:
+/// - `v % 4 == 0`: decompose `v / 4` and double every term;
+/// - `v` odd: decompose `2 * v`, then pair the two even and the two odd terms and
+///   use `((a + b) / 2)^2 + ((a - b) / 2)^2 == (a^2 + b^2) / 2`.
+///
+/// For `v % 4 == 2`, random `x, y` are drawn until `p = v - x^2 - y^2` is 0, 1, or
+/// passes a probabilistic primality check with `p % 4 == 1`, in which case `p` is
+/// split into two squares from a square root of `-1` modulo `p`. The generator is
+/// seeded with a constant, so the output is deterministic.
+///
+/// `v == 0` and small `v` (see `EXHAUSTIVE_SEARCH_BOUND`) are handled explicitly,
+/// since the randomized search would recurse forever, panic or loop forever on them.
+pub fn four_squares(v: u128) -> [u64; 4] {
+    // `0 % 4 == 0` would otherwise recurse on `0 / 4 == 0` without end.
+    if v == 0 {
+        return [0; 4];
+    }
+
+    let rng = &mut StdRng::seed_from_u64(0);
+
+    let f = v % 4;
+    if f == 2 {
+        // The sampling range `[2, isqrt(v) - 1]` is empty or too narrow for small `v`.
+        if v < EXHAUSTIVE_SEARCH_BOUND {
+            return four_squares_exhaustive(v);
+        }
+
+        let b = isqrt(v as _) as u64;
+
+        'main_loop: loop {
+            // Both candidates are drawn from `[2, b - 1]`.
+            let x = 2 + rng.gen::<u64>() % (b - 2);
+            let y = 2 + rng.gen::<u64>() % (b - 2);
+
+            let (sum, o) = u128::overflowing_add(sqr(x as u128), sqr(y as u128));
+            if o || sum > v {
+                continue 'main_loop;
+            }
+
+            let p = v - sum;
+
+            // 0 = 0^2 + 0^2 and 1 = 0^2 + 1^2 need no further work.
+            if p == 0 || p == 1 {
+                return [0, p as u64, x, y];
+            }
+
+            if p % 4 != 1 {
+                continue 'main_loop;
+            }
+
+            // Write p - 1 = d * 2^s with d odd.
+            let mut d = p - 1;
+            let mut s = 0u32;
+            while d % 2 == 0 {
+                d /= 2;
+                s += 1;
+            }
+            let d = d;
+            let s = s;
+
+            let mont = Montgomery::new(p);
+            // Random base in `[2, p - 2]`.
+            let a = 2 + (rng.gen::<u128>() % (p - 3));
+
+            // One round in the style of a Miller-Rabin test, which also records a
+            // square root of -1 modulo p (in Montgomery form) when one shows up.
+            let mut sqrt = 0;
+            {
+                let a = mont.mont_from_natural(a);
+                let one = mont.mont_from_natural(1);
+                let neg_one = p - one;
+
+                let mut x = mont.exp(a, d);
+                let mut y = 0;
+
+                for _ in 0..s {
+                    y = mont.mul(x, x);
+                    // A square root of 1 other than +-1 proves p composite.
+                    if y == one && x != one && x != neg_one {
+                        continue 'main_loop;
+                    }
+                    if y == neg_one {
+                        sqrt = x;
+                    }
+                    x = y;
+                }
+                // Here y == a^(p - 1), which must be 1 for a prime p.
+                if y != one {
+                    continue 'main_loop;
+                }
+            }
+            // This base never went through -1; try again with fresh randomness.
+            if sqrt == 0 {
+                continue 'main_loop;
+            }
+
+            let i = mont.natural_from_mont(sqrt);
+            // Use the representative of the square root that lies above p / 2.
+            let i = if i <= p / 2 { p - i } else { i };
+            let z = half_gcd(p, i) as u64;
+            let w = isqrt(p - sqr(z as u128)) as u64;
+
+            // Guards against p having been accepted although it is not z^2 + w^2.
+            if p != sqr(z as u128) + sqr(w as u128) {
+                continue 'main_loop;
+            }
+
+            return [x, y, z, w];
+        }
+    } else if f == 0 {
+        four_squares(v / 4).map(|x| x + x)
+    } else {
+        let mut r = four_squares(2 * v);
+        // Even terms first (largest first), then odd terms (smallest first), so
+        // that r[0] >= r[1] are the evens and r[3] >= r[2] are the odds.
+        r.sort_by_key(|&x| {
+            if x % 2 == 0 {
+                -1 - ((x / 2) as i64)
+            } else {
+                (x / 2) as i64
+            }
+        });
+        [
+            (r[0] + r[1]) / 2,
+            (r[0] - r[1]) / 2,
+            (r[3] + r[2]) / 2,
+            (r[3] - r[2]) / 2,
+        ]
+    }
+}
--- a/tfhe-zk-pok/src/lib.rs
+++ b/tfhe-zk-pok/src/lib.rs
@@ -3,3 +3,5 @@ pub use ark_serialize::{CanonicalDeserialize, CanonicalSerialize, Compress, Vali
 pub mod curve_446;
 pub mod curve_api;
 pub mod proofs;
+
+mod four_squares;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Guillermo Oyarzun	154ae0288f	fix correct number of blocks per iteration	2024-09-30 16:08:05 +02:00
Guillermo Oyarzun	4fd8630623	fix encryption each iteration	2024-09-26 08:30:00 +00:00
Guillermo Oyarzun	f4c63a9ece	fix proper output	2024-09-26 07:52:16 +00:00
Guillermo Oyarzun	e5ea39a9dc	chore(gpu): add pgail gpu test	2024-09-25 18:12:41 +02:00
Arthur Meyre	197354d9b0	wip: pfail	2024-09-24 09:22:02 +00:00
Agnes Leroy	934b5f40a1	chore(gpu): add some scalar ops to dedup benchmarks	2024-09-23 14:53:13 +02:00
Nicolas Sarlin	3ff81c3c4b	test(versionable): test bounds visibility in the generated code	2024-09-23 13:28:54 +02:00
Nicolas Sarlin	bce5cd3552	chore(versionable): prepare release 0.3.0	2024-09-23 13:28:54 +02:00
Nicolas Sarlin	ec83165acc	chore(versionable): run tfhe-versionable tests in ci	2024-09-23 13:28:54 +02:00
Nicolas Sarlin	d63c2f7705	chore(versionable): update examples Mostly test in the main that the derived code actually works	2024-09-23 13:28:54 +02:00
Nicolas Sarlin	5bcc34728a	doc(versionable): adds in the README that this crate uses serde	2024-09-23 13:28:54 +02:00
Nicolas Sarlin	b62228b429	feat(versionable): Versionize Vec of tuples	2024-09-23 13:28:54 +02:00
Nicolas Sarlin	b63347336b	fix(versionable)!: wrong derived bounds in the Versionize macro Over-restrictive derived bounds were in some cases unsatisfiable, making the `versionize` method uncallable. BREAKING_CHANGE: - The `#[versionize(bound = ...)]` attribute is not needed anymore, so it has been removed.	2024-09-23 13:28:54 +02:00
Nicolas Sarlin	a631904bd1	feat(zk): add metadata to v2	2024-09-23 13:27:24 +02:00
Agnes Leroy	da850865ec	chore(gpu): add file to run full tests on H100 from workflow only	2024-09-23 13:02:17 +02:00
dependabot[bot]	8be769e282	chore(deps): bump tj-actions/changed-files from 45.0.1 to 45.0.2 Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 45.0.1 to 45.0.2. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](`e9772d1404...48d8f15b2a`) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2024-09-23 11:00:51 +02:00
David Testé	47ea8bf45c	chore(deps): update slab-github-runner requirement to last version	2024-09-23 09:46:52 +02:00
Agnes Leroy	4823b8a1a0	chore(gpu): initialize some arrays to 0	2024-09-20 22:51:30 +02:00
Agnes Leroy	01f3a6d133	chore(gpu): disable slack notification for fast h100 test success	2024-09-20 17:39:52 +02:00
Nicolas Sarlin	bf613f36b3	feat(hl): impl Named for key types	2024-09-20 17:28:43 +02:00
Pedro Alves	faf200218b	chore(gpu): add checks to ensure limits for compression	2024-09-19 15:57:16 -03:00
Agnes Leroy	24088fd494	chore(gpu): add scalar div and signed scalar div to hl api Also add overflowing sub to hl	2024-09-19 19:11:45 +02:00
Agnes Leroy	48315dca80	feat(gpu): signed scalar div	2024-09-19 19:11:45 +02:00
Agnes Leroy	52b148a728	chore(gpu): temporarily set test threads to 1 for the GPU	2024-09-19 19:11:28 +02:00
Agnes Leroy	d0624d6184	chore(gpu): fix multi-gpu div performance	2024-09-19 16:56:47 +02:00
Agnes Leroy	00fc2818a9	chore(gpu): remove useless syncs	2024-09-19 16:56:47 +02:00
Titouan Tanguy	b93c23e5f8	feat(integer): add raw parts API to integer CompressionPrivateKeys	2024-09-19 14:40:42 +02:00
Nicolas Sarlin	1c59c1c260	fix(gpu): use build profile for cuda release	2024-09-19 14:40:15 +02:00
David Testé	ca7b29163e	chore(ci): add token to checkout private repo tfhe-rs-internal	2024-09-19 14:00:34 +02:00
Agnes Leroy	f7a18ddb23	chore(gpu): remove unchecked benchmarks and add ilog2 to dedup ops	2024-09-19 13:16:17 +02:00
Arthur Meyre	7b9085d0e2	feat(integer): add raw parts API to integer (De)CompressionKey	2024-09-19 11:57:50 +02:00
Arthur Meyre	d52fa249a5	feat(shortint): derive PartialEq on Compression and Decompression keys	2024-09-19 11:57:50 +02:00
Arthur Meyre	35e7031751	feat: add raw parts API for CompressedCiphertextList in HL API	2024-09-19 11:57:50 +02:00
Arthur Meyre	d9662daea5	doc(shortint): add some information about expand and the casting_mode used	2024-09-19 10:29:05 +02:00
Arthur Meyre	32cdb0b5a0	fix: expand_with_key was not providing the safest set of modes - it meant that lists needing unpacking could crash during expand	2024-09-19 10:29:05 +02:00
Agnes Leroy	a6aa95ce2d	fix(gpu): fix comparisons	2024-09-18 21:18:53 +02:00
Arthur Meyre	97d7ed9ec2	chore(ci): only notify for most things on failure	2024-09-18 17:41:24 +02:00
Nicolas Sarlin	07045f1137	chore: update tfhe to 0.8.0-alpha.8 / cuda-backend to 0.4.0-alpha.1	2024-09-18 15:50:00 +02:00
David Testé	3ab7f49436	chore(ci): remove support for slab calls with issue comments Now all workflows use Slab GitHub Action and thus can be launched directly with a workflow_dispatch event.	2024-09-18 13:42:17 +02:00
Pedro Alves	040e28d822	chore(gpu): downgrade compression conversion tests to become doc tests	2024-09-18 08:35:06 -03:00
Pedro Alves	a113674c82	feat(gpu): implement conversion from CompressedCiphertextList to CudaCompressedCiphertextList	2024-09-18 08:35:06 -03:00
Pedro Alves	1d06691dda	feat(gpu): implement conversion from CudaCompressedCiphertextList to CompressedCiphertextList	2024-09-18 08:35:06 -03:00
Guillermo Oyarzun	fc21804f3e	feat(gpu): generate and apply many luts	2024-09-18 11:58:22 +02:00
Arthur Meyre	c0878f1600	chore: bump version to 0.8.0-alpha.7	2024-09-17 13:59:32 +02:00
Arthur Meyre	97f1277e06	feat: allow to verify a proof without expanding it	2024-09-17 13:59:32 +02:00
aquint-zama	e1dd4ba4bf	chore: ensure actions are pinned by commit hash	2024-09-16 18:08:26 +02:00
David Testé	d96a368b37	chore(bench): fix display name for unchecked bitwise operations	2024-09-16 15:14:54 +02:00
Agnes Leroy	47c8d4cf64	chore(gpu): set test threads to 1 when BIG_INSTANCE is false to get a better view of failures in the ci	2024-09-16 13:19:48 +02:00
Agnes Leroy	9633b61298	fix(gpu): add missing synchronize in scalar add, refactor scalar add on cuda side	2024-09-16 09:05:16 +02:00
Agnes Leroy	8299e1cb9a	chore(gpu): change multi-gpu tests to run on rtx so it's cheaper	2024-09-16 09:04:56 +02:00
tmontaigu	72ad76b5e7	fix(integer): do sum by safe chunk sizes Parameters are made with with assumptions on the number of leveled add/sub/scalar_mul operations are made, so that the noise level before doing a PBS has a correct level and everything is safe, secure and correct. So the lib implementation has to uphold these assumptions in order to keep the error probability failure correct. In the comparisons, at some point we had a vector of ciphertexts with a degree == 1, so we greedily summed them (e.g with 2_2 params we summed them by chunks of 15), while it is correct with regards to the carry and message space it is however less correct with regards to the noise level. Noise wise, doing this huge sum is correct as long as the noise of each ciphertext is independent from the others in the same chunk. While it may generally be the case we are in, its not guaranteed, and since we do not track that information we have to take the safer approach of assuming the worst case: all noise are dependent. So to fix the issue we compute the correct size of sum chunk by also taking into account the max noise level.	2024-09-13 15:55:17 +02:00
Arthur Meyre	0e6423820f	feat(tfhe): add possibility to expand a ciphertext without verifying it	2024-09-13 14:59:21 +02:00
Arthur Meyre	c45ee6a236	chore(wasm): add missing (?) wasm_bindgen annotation	2024-09-13 14:59:21 +02:00
Arthur Meyre	cf7b21f1af	chore(integer): fix an error message string referring to shortint	2024-09-13 14:59:21 +02:00
Arthur Meyre	f9026f1563	feat(zk): recompute big d in zk v1 to be more efficient when k < k_max	2024-09-13 14:21:00 +02:00
Nicolas Sarlin	95ab73cbaa	chore(zk): add some comments to the zk pke v2 proof	2024-09-13 13:01:30 +02:00
Arthur Meyre	35faaef431	chore: bump version to 0.8.0-alpha.6	2024-09-13 10:25:03 +02:00
Arthur Meyre	a2ae1a4440	feat(zk): manage D as an upper bound as in the report - allows to prove less slots than what the CRS can handle	2024-09-13 10:24:32 +02:00
David Testé	077d5727da	chore(bench): make compression benchmarks available for database	2024-09-13 10:04:51 +02:00
Agnes Leroy	8314e7d47c	chore(gpu): return if chunk_size is 0	2024-09-12 17:26:13 +02:00
Agnes Leroy	9dca245946	fix(gpu): return early in sum_ct if num radix is 2, pass different pointers to smart copy	2024-09-12 17:26:13 +02:00
Agnes Leroy	345f25c5c3	chore(gpu): fix partial sum ct with 0 or 1 inputs in the vec Also refactor the interface for Hillis & Steele prefix sum	2024-09-12 17:26:13 +02:00
tmontaigu	c6756748f7	feat(integer): improve comparison algorithm Use subtraction to do comparisons lt/le/gt/ge	2024-09-12 15:48:02 +02:00
Mayeul@Zama	bd21971c84	chore(all): fix new warnings in doctests	2024-09-12 14:20:38 +02:00
Mayeul@Zama	e96ad74006	chore(all): enable all warnings in doctests	2024-09-12 14:20:38 +02:00
Mayeul@Zama	abd87a0f0c	chore(integer): remove #![allow(dead_code)]	2024-09-12 14:20:38 +02:00
Arthur Meyre	3875c97574	chore(ci): remove the usage of allow attributes with "reason" - this is a bandaid fix to be able to publish	2024-09-12 11:34:08 +02:00
Agnes Leroy	6fabe6bab0	chore(gpu): fix templates and refactor radix negation	2024-09-12 09:21:54 +02:00
Arthur Meyre	91171c738d	chore: bump version of tfhe to 0.8.0-alpha.5	2024-09-11 18:06:25 +02:00
Arthur Meyre	7bf0dc157d	chore: bump tfhe-zk-pok version to 0.3.0-alpha.1	2024-09-11 18:06:25 +02:00
Arthur Meyre	0612ef5be5	feat(integer): plug metadata into lower level ZK APIs	2024-09-11 18:06:25 +02:00
Arthur Meyre	aee4c1ed18	feat(shortint): plug metadata API in the lower level ZK APIs	2024-09-11 18:06:25 +02:00
Arthur Meyre	e2a3ef151a	feat(core): plug metadata into ZK APIs	2024-09-11 18:06:25 +02:00
Arthur Meyre	6f77bea5e0	feat(zk): add metadata management to v1 - proof function takes an additional u8 slice which is hashed in the proof the verification cannot happen without the same metadata being provided again	2024-09-11 18:06:25 +02:00
Arthur Meyre	e4f72dab30	chore(ci): make a check for wasm bindings with and without zk-pok	2024-09-11 18:06:25 +02:00
Arthur Meyre	7ed3fded4a	chore(ci): the detect handles option from jest is freezing the runner - trying to find the cause is making the problem worse, reverting	2024-09-11 17:25:40 +02:00
David Testé	488c942a3a	refactor(shortint): move parameters set to their own directory This is done to ease automatic parameters updates.	2024-09-11 13:54:23 +02:00
Mayeul@Zama	c0d98394fa	refactor(integer): add compression key types	2024-09-11 13:53:04 +02:00
Mayeul@Zama	93ff6992e2	refactor(all): refactor oprf integer and hl APIs	2024-09-11 10:49:39 +02:00
Pedro Alves	2a4026c761	fix(gpu): fix some edge-cases (and booleans) on compression	2024-09-10 23:11:20 +02:00
Pedro Alves	39c424b14d	chore(gpu): add debug/release modes	2024-09-09 14:02:10 +02:00
Guillermo Oyarzun	46a7a3b43b	refactor(gpu): avoid synchronizations in the keybundle	2024-09-09 14:01:15 +02:00
Mayeul@Zama	38b5759e88	chore(all): fix new lints	2024-09-09 11:57:45 +02:00
Mayeul@Zama	d6f8e59394	chore(all): update toolchain	2024-09-09 11:57:45 +02:00
dependabot[bot]	a95db07003	chore(deps): bump tj-actions/changed-files from 45.0.0 to 45.0.1 Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 45.0.0 to 45.0.1. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](`40853de9f8...e9772d1404`) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2024-09-09 11:05:21 +02:00
David Testé	6544e6f6a3	chore(ci): use python script to send benchmark results Using this script simplify writing of corresponding workflow step. Moreover, now when an upload fails it translate into a workflow failure.	2024-09-09 11:04:06 +02:00
Agnes Leroy	1d549dfd8a	chore(gpu): pass over all cuda bind	2024-09-06 17:47:59 +02:00
Arthur Meyre	019548daa5	chore(ci): add a flag to jest to indicate what might be stuck when running	2024-09-06 17:41:22 +02:00
Arthur Meyre	26b666955a	chore(ci): timeout wasm bench and test at the GitHub runner level - avoids a stuck runner for 6 hours - actions timeouts are slightly larger than the test runner timeout to have a chance to get a log out	2024-09-06 17:41:22 +02:00
Arthur Meyre	ce9da12e65	feat(zk): implement faster pke proof - original work by Sarah El kazdadi co-authored-by: sarah el kazdadi <sarah.elkazdadi@zama.ai>	2024-09-06 14:25:57 +02:00
Arthur Meyre	32b45ac4bc	chore(js): increase timeout for ZK test as it can be surpassed - this seemed to cause the test runner to hang forever - also add a timeout in the GitHub workflow, to avoid having the test runner wait forever (or in this case 6 hours because of default timeout)	2024-09-06 14:19:07 +02:00
Arthur Meyre	26055b236e	feat(tfhe): allow unpacking packed compact ciphertext lists in js/wasm	2024-09-06 14:19:07 +02:00
Agnes Leroy	ce9e355c15	chore(gpu): reduce the amount of weekly multi-gpu bench	2024-09-06 11:55:34 +02:00
tmontaigu	85cc638c62	chore(gpu): fix bad merge	2024-09-06 10:21:00 +02:00
Agnes Leroy	d454b5386b	chore(gpu): remove device synchronization in drop for CudaVec	2024-09-05 14:13:06 +02:00
tmontaigu	426f3bd192	feat(hlapi): add tag system Tag The `Tag` allows to store bytes alongside of entities (keys, and ciphertext) the main purpose of this system is to `tag` / identify ciphertext with their keys. * When encrypted, a ciphertext gets the tag of the key used to encrypt it. * Ciphertexts resulting from operations (add, sub, etc.) get the tag from the ServerKey used * PublicKey gets its tag from the ClientKey that was used to create it * ServerKey gets its tag from the ClientKey that was used to create it User can change the tag of any entities at any point. BREAKING CHANGE: Many of the into_raw_parts and from_raw_parts changed to accommodate the addition of the `tag``	2024-09-05 10:32:35 +02:00
tmontaigu	4c707e79d8	feat(hlapi): bind cuda's trailing/leading_ones/zeros, ilog2	2024-09-04 19:38:14 +02:00
Arthur Meyre	e1afb8126d	chore: bump version to 0.8.0-alpha.4	2024-09-04 17:30:43 +02:00
Agnes Leroy	0d1ef0af7e	chore(gpu): add ilog2 bench	2024-09-04 17:03:20 +02:00
Arthur Meyre	15e3474cda	feat(pbs): slightly improve f64 pbs perf co-authored-by: sarah el kazdadi <sarah.elkazdadi@zama.ai>	2024-09-03 19:31:14 +02:00
Arthur Meyre	10be6f9423	chore(ci): update node project packages	2024-09-03 17:14:36 +02:00
David Testé	c521c2ca2e	chore(ci): avoid running integer tests on push to internal repo	2024-09-03 15:29:15 +02:00
David Testé	39c46056f6	chore(ci): rename benchmark workflows to ease file navigation	2024-09-03 10:34:14 +02:00
Pedro Alves	aa2b27460c	fix(gpu): update the internal benchmark tool for the TBC pbs	2024-09-02 13:16:18 +02:00