refactor(gpu): random test fft

fix(gpu): fix gpu index in casts, scalar comparison, scalar mul, etc.
feat(gpu): Implement propagate_single_carry_get_input_carries
2026-04-28 03:01:21 -04:00 · 2024-06-27 10:10:13 +02:00 · 2024-06-27 10:08:11 +02:00 · 2024-06-26 17:34:28 +02:00 · 2024-06-26 16:47:29 +02:00 · 2024-06-26 11:30:05 +02:00
574 changed files with 33564 additions and 28086 deletions
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -3,6 +3,7 @@ self-hosted-runner:
  labels:
    - m1mac
    - 4090-desktop
+    - large_windows_16_latest
 # Configuration variables in array of strings defined in your repository or
 # organization. `null` means disabling configuration variables check.
 # Empty array means no configuration variable is allowed.
--- a/.github/workflows/approve_label.yml
+++ b/.github/workflows/approve_label.yml
@@ -1,5 +1,5 @@
-# Manage approved label in pull request
-name: PR approved label manager
+# Add labels to pull requests
+name: PR label manager

 on:
  pull_request:
@@ -27,7 +27,9 @@ jobs:
      # Add label only if the review is approved and if the label doesn't already exist
      - name: Add approved label
        uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
-        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
+        if: ${{ github.event_name == 'pull_request_review' 
+          && github.event.review.state == 'approved'
+          && !contains(fromJSON(env.LABELS), 'approved') }}
        with:
          # We need to use a PAT to be able to trigger `labeled` event for the other workflow.
          github_token: ${{ secrets.FHE_ACTIONS_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,17 +44,23 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

+      - name: Install git-lfs
+        run: |
+          sudo apt update && sudo apt -y install git-lfs
+
      - name: Run concrete-csprng tests
        run: |
          make test_concrete_csprng
@@ -107,6 +113,17 @@ jobs:
        run: |
          make test_safe_deserialization

+      - name: Clone test data
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/tfhe-backward-compat-data
+          path: tfhe/tfhe-backward-compat-data
+          lfs: 'true'
+
+      - name: Run backward compatibility tests
+        run: |
+          make test_backward_compatibility_ci
+
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
@@ -123,7 +140,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_gpu_4090_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_4090_tests.yml
@@ -16,7 +16,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [labeled]
+    types: [ labeled ]

 jobs:
  cuda-tests-linux:
@@ -29,10 +29,12 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

--- a/.github/workflows/aws_tfhe_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,14 +55,16 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

@@ -121,14 +123,16 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

@@ -183,7 +187,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -10,6 +10,9 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  # We clear the in-memory key cache to reduce memory pressure, because cargo nextest
+  # spawns numerous processes
+  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -27,7 +30,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -45,14 +48,16 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

@@ -88,7 +93,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_multi_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_multi_gpu_tests.yml
@@ -0,0 +1,134 @@
+# Compile and test tfhe-cuda-backend on an AWS instance
+name: TFHE Cuda Backend - Full tests multi-GPU
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-tests-multi-gpu)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: multi-gpu-test
+
+  cuda-tests-linux:
+    name: CUDA multi-GPU tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Run core crypto, integer and internal CUDA backend tests
+        run: |
+          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+      - name: Slack Notification
+        if: ${{ always() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-tests-multi-gpu)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -10,6 +10,9 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  # We clear the in-memory key cache to reduce memory pressure, because cargo nextest
+  # spawns numerous processes
+  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -27,7 +30,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -45,14 +48,16 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

@@ -92,7 +97,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -24,6 +24,8 @@ on:
 jobs:
  should-run:
    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    permissions:
      pull-requests: write
    outputs:
@@ -55,13 +57,13 @@ jobs:
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@5e85e31a0187e8df23b438284aa04f21b55f1510
+        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -127,7 +129,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -147,14 +149,16 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

@@ -231,7 +235,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -45,14 +45,16 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

@@ -70,7 +72,7 @@ jobs:

      - name: Run parallel wasm tests
        run: |
-          make ci_test_web_js_api_parallel
+          make test_web_js_api_parallel_ci

      - name: Slack Notification
        if: ${{ always() }}
@@ -88,7 +90,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/boolean_benchmark.yml
+++ b/.github/workflows/boolean_benchmark.yml
@@ -53,7 +53,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -63,7 +63,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -103,7 +103,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -19,11 +19,11 @@ jobs:

    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest-large, windows-latest]
+        os: [ubuntu-latest, macos-latest-large, large_windows_16_latest]
      fail-fast: false

    steps:
-      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+      - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Install and run newline linter checks
        if: matrix.os == 'ubuntu-latest'
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -10,7 +10,7 @@ jobs:
      - name: Check first line
        uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
        with:
-          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\(\w+\))?\:) .+$'
+          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\:) .+$'
          flags: "gs"
          error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
          excludeDescription: "true" # optional: this excludes the description body of a pull request
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Get actionlint
        run: |
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -6,70 +6,58 @@ env:
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-    # All the inputs are provided by Slab
-    inputs:
-      instance_id:
-        description: "AWS instance ID"
-        type: string
-      instance_image_id:
-        description: "AWS instance AMI ID"
-        type: string
-      instance_type:
-        description: "AWS instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: 'Slab request ID'
-        type: string
-      fork_repo:
-        description: 'Name of forked repo as user/repo'
-        type: string
-      fork_git_sha:
-        description: 'Git SHA to checkout from fork'
-        type: string
+  # The code coverage workflow is only run via the workflow_dispatch event since its execution duration is not stabilized yet.

 jobs:
-  code-coverage:
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
-      cancel-in-progress: true
-    runs-on: ${{ inputs.runner_name }}
-    timeout-minutes: 11520 # 8 days
+  setup-instance:
+    name: Setup instance (code-coverage)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      # Step used for log purpose.
-      - name: Instance configuration used
-        run: |
-          echo "ID: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-          echo "Fork repo: ${{ inputs.fork_repo }}"
-          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
-
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
-          repository: ${{ inputs.fork_repo }}
-          ref: ${{ inputs.fork_git_sha }}
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+  code-coverage:
+    name: Code coverage tests
+    needs: setup-instance
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.event_name }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 5760 # 4 days
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@03334d095e2739fa9ac4034ec16f66d5d01e9eba
+        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
        with:
          files_yaml: |
            tfhe:
@@ -99,7 +87,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@125fc84a9a348dbcf27191600683ec096ec9021c
+        uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -113,7 +101,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@125fc84a9a348dbcf27191600683ec096ec9021c
+        uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -127,8 +115,28 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:
+    name: Teardown instance (code-coverage)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, code-coverage ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (code-coverage) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/core_crypto_benchmark.yml
+++ b/.github/workflows/core_crypto_benchmark.yml
@@ -53,7 +53,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -63,13 +63,14 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

      - name: Run benchmarks with AVX512
        run: |
          make bench_pbs
+          make bench_pbs128
          make bench_ks

      - name: Parse results
@@ -94,7 +95,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/core_crypto_gpu_benchmark.yml
+++ b/.github/workflows/core_crypto_gpu_benchmark.yml
@@ -20,13 +20,14 @@ jobs:
  setup-instance:
    name: Setup instance (cuda-core-crypto-benchmarks)
    runs-on: ubuntu-latest
-    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,16 +56,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install ca-certificates curl
-          sudo install -m 0755 -d /etc/apt/keyrings
-          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-          sudo chmod a+r /etc/apt/keyrings/docker.asc
-          echo \
-          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-          $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
-          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -73,7 +65,7 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -91,7 +83,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -142,7 +134,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -165,7 +157,7 @@ jobs:
  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-core-crypto-benchmarks ]
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ubuntu-latest
    if: ${{ !success() && !cancelled() }}
    continue-on-error: true
    steps:
@@ -183,7 +175,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -17,7 +17,6 @@ on:
  pull_request:
    types: [ labeled ]

-
 jobs:
  setup-instance:
    name: Setup instance (csprng-randomness-tests)
@@ -28,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -46,14 +45,16 @@ jobs:
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

@@ -77,7 +78,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_4090_full_benchmark.yml
+++ b/.github/workflows/gpu_4090_full_benchmark.yml
@@ -27,7 +27,7 @@ jobs:
    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
-      cancel-in-progress: true
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ["self-hosted", "4090-desktop"]
    timeout-minutes: 1440 # 24 hours
    strategy:
@@ -39,7 +39,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -52,12 +52,12 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -114,13 +114,13 @@ jobs:
    needs: cuda-integer-benchmarks
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
-      cancel-in-progress: true
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ["self-hosted", "4090-desktop"]
    timeout-minutes: 1440 # 24 hours

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -133,12 +133,12 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/hyperstack_tfhe_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_gpu_tests.yml
@@ -0,0 +1,164 @@
+# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
+name: TFHE Cuda Backend - Full tests on H100
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-h100-tests)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-tests-linux:
+    name: CUDA H100 tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install ca-certificates curl
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo \
+          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+           $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name:
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
+      - name: Run core crypto, integer and internal CUDA backend tests
+        run: |
+          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-h100-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_benchmark.yml
+++ b/.github/workflows/integer_benchmark.yml
@@ -46,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -97,7 +97,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/integer_full_benchmark.yml
+++ b/.github/workflows/integer_full_benchmark.yml
@@ -74,7 +74,7 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -92,12 +92,12 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/integer_gpu_benchmark.yml
+++ b/.github/workflows/integer_gpu_benchmark.yml
@@ -23,13 +23,14 @@ jobs:
  setup-instance:
    name: Setup instance (cuda-integer-benchmarks)
    runs-on: ubuntu-latest
-    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    if:  github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -58,16 +59,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install ca-certificates curl
-          sudo install -m 0755 -d /etc/apt/keyrings
-          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-          sudo chmod a+r /etc/apt/keyrings/docker.asc
-          echo \
-          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-          $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
-          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -76,7 +68,7 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -94,7 +86,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -154,7 +146,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -177,7 +169,7 @@ jobs:
  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-benchmarks ]
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ubuntu-latest
    if: ${{ !success() && !cancelled() }}
    continue-on-error: true
    steps:
@@ -195,7 +187,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_gpu_full_benchmark.yml
@@ -22,13 +22,14 @@ jobs:
  setup-instance:
    name: Setup instance (cuda-integer-full-benchmarks)
    runs-on: ubuntu-latest
-    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,16 +63,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install ca-certificates curl
-          sudo install -m 0755 -d /etc/apt/keyrings
-          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-          sudo chmod a+r /etc/apt/keyrings/docker.asc
-          echo \
-          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-          $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
-          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -80,7 +72,7 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -98,7 +90,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -123,7 +115,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -170,7 +162,7 @@ jobs:
  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-full-benchmarks ]
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ubuntu-latest
    if: ${{ !success() && !cancelled() }}
    continue-on-error: true
    steps:
@@ -188,7 +180,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_multi_bit_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_benchmark.yml
@@ -46,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -97,7 +97,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/integer_multi_bit_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_gpu_benchmark.yml
@@ -23,13 +23,14 @@ jobs:
  setup-instance:
    name: Setup instance (cuda-integer-multi-bit-benchmarks)
    runs-on: ubuntu-latest
-    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -59,16 +60,7 @@ jobs:
      - name: Install dependencies
        run: |
          sudo apt update
-          sudo apt install ca-certificates curl
-          sudo install -m 0755 -d /etc/apt/keyrings
-          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
-          sudo chmod a+r /etc/apt/keyrings/docker.asc
-          echo \
-          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-          $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
-          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
          cd cmake-${{ env.CMAKE_VERSION }}
@@ -77,7 +69,7 @@ jobs:
          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -95,7 +87,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -155,7 +147,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -179,7 +171,7 @@ jobs:
  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    runs-on: ubuntu-latest
    if: ${{ !success() && !cancelled() }}
    continue-on-error: true
    steps:
@@ -197,7 +189,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
@@ -0,0 +1,181 @@
+# Run 64-bit multi-bit integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: Integer multi GPU Multi-bit benchmarks
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekly benchmarks are triggered every Saturday at 1 a.m. (UTC).
+    - cron: '0 1 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
+    runs-on: ubuntu-latest
+    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: multi-gpu-test
+
+  cuda-integer-multi-bit-multi-gpu-benchmarks:
+    name: Execute multi GPU integer multi-bit benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 1440 # 24 hours
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+    
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run multi-bit benchmarks with AVX512
+        run: |
+          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "p3.8xlarge" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        with:
+          name: ${{ github.sha }}_integer
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}
+          SLACK_MESSAGE: "Integer multi GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_multi_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_multi_gpu_full_benchmark.yml
@@ -0,0 +1,185 @@
+# Run all integer benchmarks on a multi-GPU instance with CUDA and return parsed results to the Slab CI bot.
+name: Integer multi GPU full benchmarks
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekly benchmarks are triggered every Saturday at 1 a.m. (UTC).
+    - cron: '0 1 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-integer-full-multi-gpu-benchmarks)
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: multi-gpu-test
+
+  cuda-integer-full-multi-gpu-benchmarks:
+    name: Execute multi GPU integer benchmarks for all operations flavor
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 1440 # 24 hours
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        command: [integer, integer_multi_bit]
+        op_flavor: [default, unchecked]
+        # explicit include-based build matrix, of known valid options
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+    
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "p3.8xlarge" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        with:
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ !success() && !cancelled() }}  # Notify only when an upstream job did not succeed and the run was not cancelled.
+    continue-on-error: true  # A failed notification must not fail the workflow.
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}
+          SLACK_MESSAGE: "Integer multi GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-full-multi-gpu-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -3,7 +3,7 @@ name: Tests on M1 CPU
 on:
  workflow_dispatch:
  pull_request:
-    types: [labeled]
+    types: [ labeled ]
  # Have a nightly build for M1 tests
  schedule:
    # * is a special character in YAML so you have to quote this string
@@ -18,6 +18,9 @@ env:
  RUST_MIN_STACK: "8388608"
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  FAST_TESTS: "TRUE"
+  # We clear the cache to reduce memory pressure because of the numerous processes of cargo
+  # nextest
+  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
@@ -31,10 +34,12 @@ jobs:
    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+      - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          persist-credentials: 'false'

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -20,9 +20,14 @@ on:
        description: "Push node js package"
        type: boolean
        default: true
+      npm_latest_tag:
+        description: "Set NPM tag as latest"
+        type: boolean
+        default: false

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  NPM_TAG: ""

 jobs:
  publish_release:
@@ -30,10 +35,15 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

+      - name: Create NPM version tag
+        if: ${{ inputs.npm_latest_tag }}
+        run: |
+          echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
+
      - name: Publish crate.io package
        if: ${{ inputs.push_to_crates }}
        env:
@@ -45,7 +55,7 @@ jobs:
      - name: Build web package
        if: ${{ inputs.push_web_package }}
        run: |
-          make build_web_js_api
+          make build_web_js_api_parallel

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
@@ -54,6 +64,7 @@ jobs:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
+          tag: ${{ env.NPM_TAG }}

      - name: Build Node package
        if: ${{ inputs.push_node_package }}
@@ -70,6 +81,7 @@ jobs:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
+          tag: ${{ env.NPM_TAG }}

      - name: Slack Notification
        if: ${{ failure() }}
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -54,7 +54,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -63,7 +63,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: stable

@@ -112,7 +112,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -17,10 +17,10 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      - name: Checkout lattice-estimator
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
--- a/.github/workflows/shortint_benchmark.yml
+++ b/.github/workflows/shortint_benchmark.yml
@@ -45,7 +45,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -55,7 +55,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -95,7 +95,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/shortint_full_benchmark.yml
+++ b/.github/workflows/shortint_full_benchmark.yml
@@ -53,7 +53,7 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -71,12 +71,12 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/signed_integer_benchmark.yml
+++ b/.github/workflows/signed_integer_benchmark.yml
@@ -46,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -97,7 +97,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/signed_integer_full_benchmark.yml
+++ b/.github/workflows/signed_integer_full_benchmark.yml
@@ -52,7 +52,7 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -70,12 +70,12 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/signed_integer_multi_bit_benchmark.yml
+++ b/.github/workflows/signed_integer_multi_bit_benchmark.yml
@@ -46,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

@@ -97,7 +97,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/start_benchmarks.yml
+++ b/.github/workflows/start_benchmarks.yml
@@ -36,10 +36,6 @@ on:
        description: "Run core crypto benches"
        type: boolean
        default: true
-      wasm_client_bench:
-        description: "Run WASM client benches"
-        type: boolean
-        default: true

 jobs:
  start-benchmarks:
@@ -49,17 +45,17 @@ jobs:
        command: [ boolean_bench, shortint_bench,
                   integer_bench, integer_multi_bit_bench,
                   signed_integer_bench, signed_integer_multi_bit_bench,
-                   core_crypto_bench, wasm_client_bench ]
+                   core_crypto_bench ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@03334d095e2739fa9ac4034ec16f66d5d01e9eba
+        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
        with:
          files_yaml: |
            common_benches:
@@ -101,12 +97,9 @@ jobs:
              - tfhe/src/core_crypto/**
              - tfhe/benches/core_crypto/**
              - .github/workflows/core_crypto_benchmark.yml
-            wasm_client_bench:
-              - tfhe/web_wasm_parallel_tests/**
-              - .github/workflows/wasm_client_benchmark.yml

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/start_full_benchmarks.yml
+++ b/.github/workflows/start_full_benchmarks.yml
@@ -30,12 +30,12 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
      - name: git-sync
--- a/.github/workflows/wasm_client_benchmark.yml
+++ b/.github/workflows/wasm_client_benchmark.yml
@@ -1,32 +1,14 @@
-# Run WASM client benchmarks on an AWS instance and return parsed results to Slab CI bot.
+# Run WASM client benchmarks on an instance and return parsed results to Slab CI bot.
 name: WASM client benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      # This input is not used in this workflow but still mandatory since a calling workflow could
-      # use it. If a triggering command include a user_inputs field, then the triggered workflow
-      # must include this very input, otherwise the workflow won't be called.
-      # See start_full_benchmarks.yml as example.
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
+  push:
+    branches:
+      - main
+  schedule:
+    # Weekly benchmarks are triggered every Saturday at 1 a.m. (UTC).
+    - cron: '0 1 * * 6'

 env:
  CARGO_TERM_COLOR: always
@@ -34,56 +16,105 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  run-wasm-client-benchmarks:
-    name: Execute WASM client benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+  should-run:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'workflow_dispatch' ||
+      ((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
+    permissions:
+      pull-requests: write
+    outputs:
+      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0

+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            wasm_bench:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - tfhe/web_wasm_parallel_tests/**
+              - .github/workflows/wasm_client_benchmark.yml
+
+  setup-instance:
+    name: Setup instance (wasm-client-benchmarks)
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench == 'true')
+    needs: should-run  # Job outputs are strings: wasm_bench is compared with 'true' because a bare 'false' string is truthy.
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # Runner label consumed by the benchmark and teardown jobs.
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+  wasm-client-benchmarks:
+    name: Execute WASM client benchmarks
+    needs: [ should-run, setup-instance ]
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench == 'true')  # outputs are strings; a bare 'false' would be truthy
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
        with:
          toolchain: nightly

      - name: Run benchmarks
        run: |
          make install_node
-          make ci_bench_web_js_api_parallel
+          make bench_web_js_api_parallel_ci

      - name: Parse results
        run: |
          make parse_wasm_benchmarks
-
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py tfhe/wasm_pk_gen.csv ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
+          --hardware "m6i.4xlarge" \
+          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --key-gen

@@ -104,7 +135,7 @@ jobs:
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          repository: zama-ai/slab
          path: slab
@@ -130,8 +161,28 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "WASM benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:
+    name: Teardown instance (wasm-client-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, wasm-client-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/zk_pke_benchmark.yml
+++ b/.github/workflows/zk_pke_benchmark.yml
@@ -0,0 +1,199 @@
+# Run PKE Zero-Knowledge benchmarks on an instance and return parsed results to Slab CI bot.
+name: PKE ZK benchmarks
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 3a.m.
+    - cron: '0 3 * * 6'
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  should-run:
+    runs-on: ubuntu-latest
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@eaf854ef0c266753e1abec356dcf17d92695b251
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            zk_pok:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+              - tfhe/src/integer/**
+              - tfhe/src/zk.rs
+              - tfhe/benches/integer/zk_pke.rs
+              - .github/workflows/zk_pke_benchmark.yml
+
+  setup-instance:
+    name: Setup instance (pke-zk-benchmarks)
+    runs-on: ubuntu-latest
+    needs: should-run
+    if: github.event_name != 'push' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'push' &&
+      github.repository == 'zama-ai/tfhe-rs' &&
+      needs.should-run.outputs.zk_pok_changed == 'true')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench
+
+  pke-zk-benchmarks:
+    name: Execute PKE ZK benchmarks
+    if: github.event_name != 'push' ||
+      ((github.event_name == 'push' || github.event_name == 'schedule') &&
+      needs.setup-instance.result != 'skipped')
+    needs: [ should-run, setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@21dc36fb71dd22e3317045c0c31a3f4249868b17
+        with:
+          toolchain: nightly
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make bench_integer_zk
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "hpc7a.96xlarge" \
+          --backend cpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Parse CRS sizes results
+        run: |
+          python3 ./ci/benchmark_parser.py tfhe/pke_zk_crs_sizes.csv ${{ env.RESULTS_FILENAME }} \
+          --key-sizes \
+          --append-results
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        with:
+          name: ${{ github.sha }}_integer_zk
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ !success() && !cancelled() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "PKE ZK benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (pke-zk-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, pke-zk-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1d4b7b7540118af5f96ac16a1dc4cfd9c5929dc8
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (pke-zk-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,9 @@ dieharder_run.log

 # Cuda local build
 backends/tfhe-cuda-backend/cuda/cmake-build-debug/
+
+# WASM tests
+tfhe/web_wasm_parallel_tests/server.PID
+
+# Dir used for backward compatibility test data
+tfhe/tfhe-backward-compat-data/
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,6 +7,11 @@ members = [
    "apps/trivium",
    "concrete-csprng",
    "backends/tfhe-cuda-backend",
+    "utils/tfhe-versionable",
+    "utils/tfhe-versionable-derive"
+]
+exclude = [
+    "tfhe/backward_compatibility_tests"
 ]

 [profile.bench]
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,8 @@ FAST_BENCH?=FALSE
 BENCH_OP_FLAVOR?=DEFAULT
 NODE_VERSION=20
 FORWARD_COMPAT?=OFF
+BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
+BACKWARD_COMPAT_DATA_DIR=tfhe-backward-compat-data
 # sed: -n, do not print input stream, -e means a script/expression
 # 1,/version/ indicates from the first line, to the line matching version at the start of the line
 # p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
@@ -273,7 +275,7 @@ clippy_c_api: install_rs_check_toolchain
 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
 clippy_js_wasm_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api \
+		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
@@ -289,7 +291,7 @@ clippy_trivium: install_rs_check_toolchain
 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
@@ -368,21 +370,21 @@ symlink_c_libs_without_fingerprint:
 .PHONY: build_c_api # Build the C API for boolean, shortint and integer
 build_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,$(FORWARD_COMPAT_FEATURE) \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,$(FORWARD_COMPAT_FEATURE) \
 		-p $(TFHE_SPEC)
 	@"$(MAKE)" symlink_c_libs_without_fingerprint

 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,gpu \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,gpu \
 		-p $(TFHE_SPEC)
 	@"$(MAKE)" symlink_c_libs_without_fingerprint

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
 build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
 		-p $(TFHE_SPEC)
 	@"$(MAKE)" symlink_c_libs_without_fingerprint

@@ -391,7 +393,7 @@ build_web_js_api: install_rs_build_toolchain install_wasm_pack
 	cd tfhe && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
 		wasm-pack build --release --target=web \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok

 .PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
 build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
@@ -399,7 +401,7 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
 	rustup component add rust-src --toolchain $(RS_CHECK_TOOLCHAIN) && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory,+mutable-globals" rustup run $(RS_CHECK_TOOLCHAIN) \
 		wasm-pack build --release --target=web \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok-experimental \
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
 		-Z build-std=panic_abort,std

 .PHONY: build_node_js_api # Build the js API targeting nodejs
@@ -407,7 +409,7 @@ build_node_js_api: install_rs_build_toolchain install_wasm_pack
 	cd tfhe && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
 		wasm-pack build --release --target=nodejs \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok

 .PHONY: build_concrete_csprng # Build concrete_csprng
 build_concrete_csprng: install_rs_build_toolchain
@@ -417,10 +419,10 @@ build_concrete_csprng: install_rs_build_toolchain
 .PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
 test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental -p $(TFHE_SPEC) -- core_crypto::
+		--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok -p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
+			--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
 	fi

 .PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
@@ -459,7 +461,7 @@ test_core_crypto_gpu: install_rs_build_toolchain
 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
 test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

@@ -591,7 +593,7 @@ test_integer_cov: install_rs_check_toolchain install_tarpaulin
 .PHONY: test_high_level_api # Run all the tests for high_level_api
 test_high_level_api: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok -p $(TFHE_SPEC) \
 		-- high_level_api::

 test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
@@ -602,14 +604,14 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 .PHONY: test_user_doc # Run tests from the .md documentation
 test_user_doc: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok-experimental \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
 		-p $(TFHE_SPEC) \
 		-- test_user_docs::

 .PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
 test_user_doc_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok-experimental -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
 		-- test_user_docs::

 .PHONY: test_fhe_strings # Run tests for fhe_strings example
@@ -648,18 +650,31 @@ test_concrete_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng

-.PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
+.PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		-p tfhe-zk-pok

+.PHONY: test_versionable # Run tests for tfhe-versionable subcrate
+test_versionable: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		-p tfhe-versionable
+
+.PHONY: test_backward_compatibility_ci # Run backward compatibility tests without cloning the data repo first
+test_backward_compatibility_ci: install_rs_build_toolchain
+	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),shortint,integer -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
+
+.PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
+test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
+
 .PHONY: doc # Build rust doc
 doc: install_rs_check_toolchain
 	@# Even though we are not in docs.rs, this allows to "just" build the doc
 	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental --no-deps -p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental,zk-pok --no-deps -p $(TFHE_SPEC)

 .PHONY: docs # Build rust doc alias for doc
 docs: doc
@@ -670,7 +685,7 @@ lint_doc: install_rs_check_toolchain
 	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental -p $(TFHE_SPEC) --no-deps
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental,zk-pok -p $(TFHE_SPEC) --no-deps

 .PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
 lint_docs: lint_doc
@@ -741,8 +756,8 @@ test_nodejs_wasm_api: build_node_js_api
 test_web_js_api_parallel: build_web_js_api_parallel
 	$(MAKE) -C tfhe/web_wasm_parallel_tests test

-.PHONY: ci_test_web_js_api_parallel # Run tests for the web wasm api
-ci_test_web_js_api_parallel: build_web_js_api_parallel
+.PHONY: test_web_js_api_parallel_ci # Run tests for the web wasm api
+test_web_js_api_parallel_ci: build_web_js_api_parallel
 	source ~/.nvm/nvm.sh && \
 	nvm install $(NODE_VERSION) && \
 	nvm use $(NODE_VERSION) && \
@@ -809,6 +824,14 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
 	--bench integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

+.PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
+bench_integer_zk: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench zk-pke-bench \
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,zk-pok,nightly-avx512 \
+	-p $(TFHE_SPEC) --
+
 .PHONY: bench_shortint # Run benchmarks for shortint
 bench_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
@@ -847,6 +870,12 @@ bench_pbs: install_rs_check_toolchain
 	--bench pbs-bench \
 	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

+.PHONY: bench_pbs128 # Run benchmarks for PBS using FFT 128 bits
+bench_pbs128: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench pbs128-bench \
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+
 .PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
 bench_pbs_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
@@ -869,8 +898,8 @@ bench_ks_gpu: install_rs_check_toolchain
 bench_web_js_api_parallel: build_web_js_api_parallel
 	$(MAKE) -C tfhe/web_wasm_parallel_tests bench

-.PHONY: ci_bench_web_js_api_parallel # Run benchmarks for the web wasm api
-ci_bench_web_js_api_parallel: build_web_js_api_parallel
+.PHONY: bench_web_js_api_parallel_ci # Run benchmarks for the web wasm api
+bench_web_js_api_parallel_ci: build_web_js_api_parallel
 	source ~/.nvm/nvm.sh && \
 	nvm use node && \
 	$(MAKE) -C tfhe/web_wasm_parallel_tests bench-ci
@@ -929,6 +958,12 @@ write_params_to_file: install_rs_check_toolchain
 	--example write_params_to_file \
 	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache

+.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
+clone_backward_compat_data:
+	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) tfhe/$(BACKWARD_COMPAT_DATA_DIR)
+
+tfhe/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
+
 #
 # Real use case examples
 #
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ production-ready library for all the advanced features of TFHE.
 <br></br>

 ## Table of Contents
- **[Getting Started](#getting-started)**
+- **[Getting started](#getting-started)**
   - [Cargo.toml configuration](#cargotoml-configuration)
   - [A simple example](#a-simple-example)
 - **[Resources](#resources)**
@@ -65,7 +65,7 @@ production-ready library for all the advanced features of TFHE.
 - **[Support](#support)**
 <br></br>

-## Getting Started
+## Getting started

 ### Cargo.toml configuration
 To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:
@@ -198,7 +198,7 @@ Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-

 ### Disclaimers

-#### Security Estimation
+#### Security estimation

 Security estimations are done using the
 [Lattice Estimator](https://github.com/malb/lattice-estimator)
@@ -206,13 +206,13 @@ with `red_cost_model = reduction.RC.BDGL16`.

 When a new update is published in the Lattice Estimator, we update parameters accordingly.

-### Security Model
+### Security model

 The default parameters for the TFHE-rs library are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at p_error = $2^{-40}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [1]. 

 [1] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf

-#### Side-Channel Attacks
+#### Side-channel attacks

 Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
 and will be released in upcoming versions.
@@ -241,7 +241,23 @@ Becoming an approved contributor involves signing our Contributor License Agreem
 <br></br>

 ### License
-This software is distributed under the **BSD-3-Clause-Clear** license. If you have any questions, please contact us at hello@zama.ai.
+This software is distributed under the **BSD-3-Clause-Clear** license. Read [this](LICENSE) for more details.
+
+#### FAQ
+**Is Zama’s technology free to use?**
+>Zama’s libraries are free to use under the BSD 3-Clause Clear license only for development, research, prototyping, and experimentation purposes. However, for any commercial use of Zama's open source code, companies must purchase Zama’s commercial patent license.
+>
+>Everything we do is open source, and we are very transparent about what it means for our users. You can read more about how we monetize our open source products at Zama in [this blog post](https://www.zama.ai/post/open-source).
+
+**What do I need to do if I want to use Zama’s technology for commercial purposes?**
+>To commercially use Zama’s technology you need to be granted Zama’s patent license. Please contact us hello@zama.ai for more information.
+
+**Do you file IP on your technology?**
+>Yes, all Zama’s technologies are patented.
+
+**Can you customize a solution for my specific use case?**
+>We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.ai.
+
 <p align="right">
  <a href="#about" > ↑ Back to top </a> 
 </p>
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -13,7 +13,7 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);

    let ksk = KeySwitchingKey::new(
-        (&client_key, &server_key),
+        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );
@@ -63,7 +63,7 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);

    let ksk = KeySwitchingKey::new(
-        (&client_key, &server_key),
+        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );
@@ -108,7 +108,7 @@ pub fn kreyvium_shortint_trans(c: &mut Criterion) {
    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);

    let ksk = KeySwitchingKey::new(
-        (&client_key, &server_key),
+        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -13,7 +13,7 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);

    let ksk = KeySwitchingKey::new(
-        (&client_key, &server_key),
+        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );
@@ -63,7 +63,7 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);

    let ksk = KeySwitchingKey::new(
-        (&client_key, &server_key),
+        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );
@@ -108,7 +108,7 @@ pub fn trivium_shortint_trans(c: &mut Criterion) {
    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);

    let ksk = KeySwitchingKey::new(
-        (&client_key, &server_key),
+        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -224,7 +224,7 @@ fn kreyvium_test_shortint_long() {
    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);

    let ksk = KeySwitchingKey::new(
-        (&client_key, &server_key),
+        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -360,7 +360,7 @@ fn trivium_test_shortint_long() {
    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);

    let ksk = KeySwitchingKey::new(
-        (&client_key, &server_key),
+        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -14,6 +14,3 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
 [build-dependencies]
 cmake = { version = "0.1" }
 pkg-config = { version = "0.3" }
-
-[dependencies]
-thiserror = "1.0"
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -9,6 +9,11 @@ fn main() {
    }

    println!("Build tfhe-cuda-backend");
+    println!("cargo::rerun-if-changed=cuda/include");
+    println!("cargo::rerun-if-changed=cuda/src");
+    println!("cargo::rerun-if-changed=cuda/tests_and_benchmarks");
+    println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
+    println!("cargo::rerun-if-changed=src");
    if env::consts::OS == "linux" {
        let output = Command::new("./get_os_name.sh").output().unwrap();
        let distribution = String::from_utf8(output.stdout).unwrap();
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -6,6 +6,7 @@
 #include <cstdlib>
 #include <cstring>
 #include <cuda_runtime.h>
+#include <vector>

 #define synchronize_threads_in_block() __syncthreads()
 extern "C" {
--- a/backends/tfhe-cuda-backend/cuda/include/helper.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper.h
@@ -1,10 +0,0 @@
-#ifndef HELPER_H
-#define HELPER_H
-
-extern "C" {
-int cuda_setup_multi_gpu();
-}
-
-void multi_gpu_checks(uint32_t gpu_count);
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -0,0 +1,18 @@
+#ifndef HELPER_MULTI_GPU_H
+#define HELPER_MULTI_GPU_H
+#include <mutex>
+
+extern std::mutex m;
+extern bool p2p_enabled;
+
+extern "C" {
+int cuda_setup_multi_gpu();
+}
+
+int get_active_gpu_count(int num_inputs, int gpu_count);
+
+int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
+
+int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count);
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -9,13 +9,15 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset = 0);

 void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset = 0);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -51,7 +51,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory);
+    uint32_t max_shared_memory, uint32_t gpu_offset = 0);

 void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
                                                   uint32_t gpu_index,
@@ -76,7 +76,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory);
+    uint32_t max_shared_memory, uint32_t gpu_offset = 0);

 void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -85,7 +85,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory);
+    uint32_t max_shared_memory, uint32_t gpu_offset = 0);

 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer);
@@ -354,7 +354,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);

 template <typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
@@ -364,7 +364,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);

 #if (CUDA_ARCH >= 900)
 template <typename Torus>
@@ -375,7 +375,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);

 template <typename Torus, typename STorus>
 void scratch_cuda_programmable_bootstrap_tbc(
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -29,23 +29,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
-
-void scratch_cuda_generic_multi_bit_programmable_bootstrap_64(
-    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
-
-void cuda_generic_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
-    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset,
+    uint32_t lwe_chunk_size = 0);

 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
@@ -80,7 +65,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size);
+    uint32_t gpu_offset, uint32_t lwe_chunk_size);
 #endif

 template <typename Torus, typename STorus>
@@ -107,7 +92,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);

 template <typename Torus, typename STorus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
@@ -126,7 +111,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);

 template <typename Torus>
 __host__ __device__ uint64_t
--- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -11,7 +11,7 @@ set(SOURCES
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper.h)
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -9,14 +9,16 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset) {
  cuda_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
      static_cast<uint32_t *>(lwe_array_in),
      static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
+      gpu_offset);
 }

 /* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
@@ -39,12 +41,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset) {
  cuda_keyswitch_lwe_ciphertext_vector(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
      static_cast<uint64_t *>(lwe_array_in),
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
+      gpu_offset);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -3,8 +3,11 @@

 #include "device.h"
 #include "gadget.cuh"
+#include "helper_multi_gpu.h"
+#include "polynomial/functions.cuh"
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
+#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

@@ -31,65 +34,128 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
 * scaling factor) under key s2 instead of s1, with an increased noise
 *
 */
+// Each thread in x is used to calculate one output.
+// Threads in y are used to parallelize the lwe_dimension_in loop.
+// Shared memory is used to store intermediate results of the reduction.
 template <typename Torus>
-__global__ void
-keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
-          Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in,
-          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) {
-  int tid = threadIdx.x;
+__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
+                          Torus *lwe_array_in, Torus *lwe_input_indexes,
+                          Torus *ksk, uint32_t lwe_dimension_in,
+                          uint32_t lwe_dimension_out, uint32_t base_log,
+                          uint32_t level_count, int gpu_offset) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;
+
  extern __shared__ int8_t sharedmem[];
+  Torus *lwe_acc_out = (Torus *)sharedmem;
+  auto block_lwe_array_out =
+      get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset],
+                lwe_dimension_out + 1);
+
  if (tid <= lwe_dimension_out) {
-    Torus *local_lwe_array_out = (Torus *)sharedmem;
-    auto block_lwe_array_in = get_chunk(
-        lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
-    auto block_lwe_array_out = get_chunk(
-        lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);
-    local_lwe_array_out[tid] = 0;

-    if (tid == lwe_dimension_out) {
-      local_lwe_array_out[lwe_dimension_out] =
-          block_lwe_array_in[lwe_dimension_in];
+    Torus local_lwe_out = 0;
+    auto block_lwe_array_in =
+        get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset],
+                  lwe_dimension_in + 1);
+
+    if (tid == lwe_dimension_out && threadIdx.y == 0) {
+      local_lwe_out = block_lwe_array_in[lwe_dimension_in];
    }
+    const Torus mask_mod_b = (1ll << base_log) - 1ll;

-    for (int i = 0; i < lwe_dimension_in; i++) {
+    const int pack_size = (lwe_dimension_in + blockDim.y - 1) / blockDim.y;
+    const int start_i = pack_size * threadIdx.y;
+    const int end_i = SEL(lwe_dimension_in, pack_size * (threadIdx.y + 1),
+                          pack_size * (threadIdx.y + 1) <= lwe_dimension_in);
+
+    // This loop distribution seems to benefit the global mem reads
+    for (int i = start_i; i < end_i; i++) {
      Torus a_i = round_to_closest_multiple(block_lwe_array_in[i], base_log,
                                            level_count);
      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
-      Torus mask_mod_b = (1ll << base_log) - 1ll;
+
      for (int j = 0; j < level_count; j++) {
        auto ksk_block =
            get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
        Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
-        local_lwe_array_out[tid] -= (Torus)ksk_block[tid] * decomposed;
+        local_lwe_out -= (Torus)ksk_block[tid] * decomposed;
      }
    }
-    block_lwe_array_out[tid] = local_lwe_array_out[tid];
+
+    lwe_acc_out[shmem_index] = local_lwe_out;
+  }
+
+  if (tid <= lwe_dimension_out) {
+    for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
+         offset /= 2) {
+      __syncthreads();
+      lwe_acc_out[shmem_index] +=
+          lwe_acc_out[shmem_index + offset * blockDim.x];
+    }
+    if (threadIdx.y == 0)
+      block_lwe_array_out[tid] = lwe_acc_out[shmem_index];
  }
 }

-/// assume lwe_array_in in the gpu
 template <typename Torus>
 __host__ void cuda_keyswitch_lwe_ciphertext_vector(
    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
    Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset = 0) {

  cudaSetDevice(gpu_index);
-  constexpr int ideal_threads = 1024;
-  if (lwe_dimension_out + 1 > ideal_threads)
-    PANIC("Cuda error (keyswitch): lwe dimension size out should be greater "
-          "or equal to the number of threads per block")

-  int lwe_size = lwe_dimension_out + 1;
-  int shared_mem = sizeof(Torus) * lwe_size;
-  dim3 grid(num_samples, 1, 1);
-  dim3 threads(ideal_threads, 1, 1);
+  constexpr int num_threads_y = 32;
+  int num_blocks, num_threads_x;
+
+  getNumBlocksAndThreads2D(lwe_dimension_out + 1, 512, num_threads_y,
+                           num_blocks, num_threads_x);
+
+  int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
+  dim3 grid(num_blocks, num_samples, 1);
+  dim3 threads(num_threads_x, num_threads_y, 1);

  keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset);
  check_cuda_error(cudaGetLastError());
 }

+template <typename Torus>
+void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
+                       uint32_t gpu_count, Torus *lwe_array_out,
+                       Torus *lwe_output_indexes, Torus *lwe_array_in,
+                       Torus *lwe_input_indexes, Torus **ksks,
+                       uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+                       uint32_t base_log, uint32_t level_count,
+                       uint32_t num_samples, bool sync_streams = true) {
+
+  /// If the number of radix blocks is lower than the number of GPUs, not all
+  /// GPUs will be active and there will be 1 input per GPU
+  auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count);
+  int num_samples_on_gpu_0 = get_num_inputs_on_gpu(num_samples, 0, gpu_count);
+  if (sync_streams)
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+#pragma omp parallel for num_threads(active_gpu_count)
+  for (uint i = 0; i < active_gpu_count; i++) {
+    int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
+    int gpu_offset = get_gpu_offset(num_samples, i, gpu_count);
+
+    // Compute Keyswitch
+    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
+        streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
+        lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in,
+        lwe_dimension_out, base_log, level_count, num_samples_on_gpu,
+        gpu_offset);
+  }
+
+  if (sync_streams)
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+}
+
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -6,7 +6,7 @@
 cudaStream_t cuda_create_stream(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
  cudaStream_t stream;
-  check_cuda_error(cudaStreamCreate(&stream));
+  check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  return stream;
 }

@@ -47,9 +47,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
      &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

  if (support_async_alloc) {
-    cuda_synchronize_stream(stream, gpu_index);
    check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream));
-    cuda_synchronize_stream(stream, gpu_index);
  } else {
    check_cuda_error(cudaMalloc((void **)&ptr, size));
  }
@@ -121,21 +119,22 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
    return;
  cudaPointerAttributes attr_dest;
  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.device != gpu_index && attr_dest.type != cudaMemoryTypeDevice) {
+  if (attr_dest.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
  }
  cudaPointerAttributes attr_src;
  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.device != gpu_index && attr_src.type != cudaMemoryTypeDevice) {
+  if (attr_src.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
-  if (attr_src.device != attr_dest.device) {
-    PANIC("Cuda error: different devices specified in copy from GPU to GPU.")
-  }
-
  check_cuda_error(cudaSetDevice(gpu_index));
-  check_cuda_error(
-      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
+  if (attr_src.device == attr_dest.device) {
+    check_cuda_error(
+        cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
+  } else {
+    check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
+                                         attr_src.device, size, stream));
+  }
 }

 /// Synchronizes device
--- a/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
@@ -294,9 +294,6 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
    __syncthreads();
  }

-  // compressed size = 8192 is actual polynomial size = 16384.
-  // from this size, twiddles can't fit in constant memory,
-  // so from here, butterfly operation access device memory.
  if constexpr (params::degree >= 8192) {
    // level 13
    tid = threadIdx.x;
@@ -307,7 +304,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
           (tid & (params::degree / 8192 - 1));
      i2 = i1 + params::degree / 8192;

-      w = negtwiddles13[twid_id];
+      w = negtwiddles[twid_id + 4096];
      u = A[i1];
      v = A[i2] * w;

@@ -351,10 +348,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  // mapping in backward fft is reversed
  // butterfly operation is started from last level

-  // compressed size = 8192 is actual polynomial size = 16384.
-  // twiddles for this size can't fit in constant memory so
-  // butterfly operation for this level access device memory to fetch
-  // twiddles
  if constexpr (params::degree >= 8192) {
    // level 13
    tid = threadIdx.x;
@@ -365,7 +358,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
           (tid & (params::degree / 8192 - 1));
      i2 = i1 + params::degree / 8192;

-      w = negtwiddles13[twid_id];
+      w = negtwiddles[twid_id + 4096];
      u = A[i1] - A[i2];

      A[i1] += A[i2];
@@ -722,4 +715,312 @@ __global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
  }
 }

+template <class params> __device__ void NSMFFT_direct_bundle(double2 *A, const double2 regs[4]) {
+
+  /* We don't make bit reverse here, since twiddles are already reversed
+   *  Each thread is always in charge of "opt/2" pairs of coefficients,
+   *  which is why we always loop through N/2 by N/opt strides
+   *  The pragma unroll instruction tells the compiler to unroll the
+   *  full loop, which should increase performance
+   */
+
+  size_t tid = threadIdx.x;
+  size_t twid_id;
+  size_t i1, i2;
+  double2 u, v, w;
+  // level 1
+  // we don't perform an actual complex multiplication on level 1 since we have
+  // only one twiddle whose real and imaginary parts are equal, so we can
+  // multiply by it with simpler operations
+  // degree = 1024, opt = 2 -> 
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    i1 = tid;
+    i2 = tid + params::degree / 2;
+
+    //u = A[i1];
+    //v = A[i2] * (double2){0.707106781186547461715008466854,
+    //                      0.707106781186547461715008466854};
+
+    u = regs[i];
+    v = regs[i + params::opt / 2] * (double2){0.707106781186547461715008466854,
+                          0.707106781186547461715008466854};
+    //A[i1] += v;
+    A[i1] = u + v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt; //256
+  }
+  
+  __syncthreads();
+
+  // level 2
+  // from this level on there is more than one twiddle and none of them has
+  // equal real and imag parts, so complete complex multiplication is needed
+  // for each level params::degree / 2^level represents number of coefficients
+  // inside divided chunk of specific level
+  //
+#pragma unroll
+  for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+    tid = threadIdx.x + i * params::degree / params::opt;
+    twid_id = tid / (params::degree / 4);
+    i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
+    i2 = i1 + params::degree / 4;
+
+    w = negtwiddles[twid_id + 2];
+    u = A[i1];
+    v = A[i2] * w;
+  
+    A[i1] += v;
+    A[i2] = u - v;
+  }
+  __syncthreads();
+
+  // level 3
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 8);
+    i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
+    i2 = i1 + params::degree / 8;
+
+    w = negtwiddles[twid_id + 4];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 4
+  //tid = threadIdx.x;
+  //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+  for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+    tid = threadIdx.x + i * params::degree / params::opt;
+    twid_id = tid / (params::degree / 16);
+    i1 =
+        2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
+    i2 = i1 + params::degree / 16;
+
+    w = negtwiddles[twid_id + 8];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    //tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 5
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 32);
+    i1 =
+        2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
+    i2 = i1 + params::degree / 32;
+
+    w = negtwiddles[twid_id + 16];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 6
+  //tid = threadIdx.x;
+  //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+  for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+    tid = threadIdx.x + i * params::degree / params::opt;
+    twid_id = tid / (params::degree / 64);
+    i1 =
+        2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
+    i2 = i1 + params::degree / 64;
+
+    w = negtwiddles[twid_id + 32];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+    //tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // level 7
+  tid = threadIdx.x;
+#pragma unroll
+  for (size_t i = 0; i < params::opt / 2; ++i) {
+    twid_id = tid / (params::degree / 128);
+    i1 = 2 * (params::degree / 128) * twid_id +
+         (tid & (params::degree / 128 - 1));
+    i2 = i1 + params::degree / 128;
+
+    w = negtwiddles[twid_id + 64];
+    u = A[i1];
+    v = A[i2] * w;
+
+    A[i1] += v;
+    A[i2] = u - v;
+
+    tid += params::degree / params::opt;
+  }
+  __syncthreads();
+
+  // From level 8 on, we need to check the size of params::degree. The minimum
+  // supported actual polynomial size is 256; the compressed size is halved, so
+  // the minimum supported compressed size is 128 and the first 7 levels of
+  // butterfly operations are always needed. Since butterfly levels are
+  // hardcoded, we need to check if the polynomial size is big enough to
+  // require each specific level of butterfly.
+  if constexpr (params::degree >= 256) {
+    // level 8
+    //tid = threadIdx.x;
+    //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+    for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+      tid = threadIdx.x + i * params::degree / params::opt;
+      twid_id = tid / (params::degree / 256);
+      i1 = 2 * (params::degree / 256) * twid_id +
+           (tid & (params::degree / 256 - 1));
+      i2 = i1 + params::degree / 256;
+
+      w = negtwiddles[twid_id + 128];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      //tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 512) {
+    // level 9
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 512);
+      i1 = 2 * (params::degree / 512) * twid_id +
+           (tid & (params::degree / 512 - 1));
+      i2 = i1 + params::degree / 512;
+
+      w = negtwiddles[twid_id + 256];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 1024) {
+    // level 10
+    //tid = threadIdx.x;
+    //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+    for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+      tid = threadIdx.x + i * params::degree / params::opt;
+      twid_id = tid / (params::degree / 1024);
+      i1 = 2 * (params::degree / 1024) * twid_id +
+           (tid & (params::degree / 1024 - 1));
+      i2 = i1 + params::degree / 1024;
+
+      w = negtwiddles[twid_id + 512];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      //tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 2048) {
+    // level 11
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 2048);
+      i1 = 2 * (params::degree / 2048) * twid_id +
+           (tid & (params::degree / 2048 - 1));
+      i2 = i1 + params::degree / 2048;
+
+      w = negtwiddles[twid_id + 1024];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 4096) {
+    // level 12
+    //tid = threadIdx.x;
+    //for (size_t i = 0; i < params::opt / 2; ++i) {
+#pragma unroll
+    for (int i = params::opt / 2 - 1; i >= 0 ; --i) {
+      tid = threadIdx.x + i * params::degree / params::opt;
+      twid_id = tid / (params::degree / 4096);
+      i1 = 2 * (params::degree / 4096) * twid_id +
+           (tid & (params::degree / 4096 - 1));
+      i2 = i1 + params::degree / 4096;
+
+      w = negtwiddles[twid_id + 2048];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      //tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+
+  if constexpr (params::degree >= 8192) {
+    // level 13
+    tid = threadIdx.x;
+#pragma unroll
+    for (size_t i = 0; i < params::opt / 2; ++i) {
+      twid_id = tid / (params::degree / 8192);
+      i1 = 2 * (params::degree / 8192) * twid_id +
+           (tid & (params::degree / 8192 - 1));
+      i2 = i1 + params::degree / 8192;
+
+      w = negtwiddles[twid_id + 4096];
+      u = A[i1];
+      v = A[i2] * w;
+
+      A[i1] += v;
+      A[i2] = u - v;
+
+      tid += params::degree / params::opt;
+    }
+    __syncthreads();
+  }
+}
+
 #endif // GPU_BOOTSTRAP_FFT_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu
@@ -1,6 +1,6 @@
 #include "cuComplex.h"

-__constant__ double2 negtwiddles[4096] = {
+__device__ double2 negtwiddles[8192] = {
    {0, 0},
    {0.707106781186547461715008466854, 0.707106781186547572737310929369},
    {0.92387953251128673848313610506, 0.382683432365089781779232680492},
@@ -4096,9 +4096,7 @@ __constant__ double2 negtwiddles[4096] = {
    {0.70791982920081630847874976098, 0.706292797233758484765075991163},
    {-0.706292797233758484765075991163, 0.70791982920081630847874976098},
    {0.00115048533711384847431913325266, 0.99999933819152553304832053982},
-    {-0.99999933819152553304832053982, 0.00115048533711384847431913325266}};
-
-__device__ double2 negtwiddles13[4096] = {
+    {-0.99999933819152553304832053982, 0.00115048533711384847431913325266},
    {0.999999981616429334252416083473, 0.000191747597310703291528452552051},
    {-0.000191747597310703291528452552051, 0.999999981616429334252416083473},
    {0.706971182161065359039753275283, 0.707242354213734603085583785287},
--- a/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
@@ -2,12 +2,7 @@
 #define GPU_BOOTSTRAP_TWIDDLES_CUH

 /*
- * 'negtwiddles' are stored in constant memory for faster access times
- * because of it's limited size, only twiddles for up to 2^12 polynomial size
- * can be stored there, twiddles for 2^13 are stored in device memory
- * 'negtwiddles13'
+ * 'negtwiddles' are stored in device memory to profit from caching
 */
-
-extern __constant__ double2 negtwiddles[4096];
-extern __device__ double2 negtwiddles13[4096];
+extern __device__ double2 negtwiddles[8192];
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -1,13 +1,13 @@
 #include "integer/bitwise_ops.cuh"

 void scratch_cuda_integer_radix_bitop_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    BITOP_TYPE op_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -15,7 +15,7 @@ void scratch_cuda_integer_radix_bitop_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_bitop_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count, params,
      op_type, allocate_gpu_memory);
 }
@@ -23,34 +23,21 @@ void scratch_cuda_integer_radix_bitop_kb_64(
 void cuda_bitop_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
-    void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
+    void **bsks, void **ksks, uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitop_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_1),
      static_cast<uint64_t *>(lwe_array_2),
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      lwe_ciphertext_count);
 }

-void cuda_bitnot_integer_radix_ciphertext_kb_64(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t lwe_ciphertext_count) {
-
-  host_integer_radix_bitnot_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<uint64_t *>(lwe_array_in),
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
-      lwe_ciphertext_count);
-}
-
-void cleanup_cuda_integer_bitop(void *stream, uint32_t gpu_index,
-                                int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
+                                uint32_t gpu_count, int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -16,38 +16,25 @@ __host__ void
 host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                            uint32_t gpu_count, Torus *lwe_array_out,
                            Torus *lwe_array_1, Torus *lwe_array_2,
-                            int_bitop_buffer<Torus> *mem_ptr, void *bsk,
-                            Torus *ksk, uint32_t num_radix_blocks) {
+                            int_bitop_buffer<Torus> *mem_ptr, void **bsks,
+                            Torus **ksks, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
-      bsk, ksk, num_radix_blocks, lut, lut->params.message_modulus);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_bitnot_kb(
-    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, int_bitop_buffer<Torus> *mem_ptr,
-    void *bsk, Torus *ksk, uint32_t num_radix_blocks) {
-
-  auto lut = mem_ptr->lut;
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsk, ksk,
-      num_radix_blocks, lut);
+      bsks, ksks, num_radix_blocks, lut, lut->params.message_modulus);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_bitop_kb(
-    cudaStream_t stream, uint32_t gpu_index, int_bitop_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_bitop_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, BITOP_TYPE op, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
-  *mem_ptr = new int_bitop_buffer<Torus>(stream, gpu_index, op, params,
-                                         num_radix_blocks, allocate_gpu_memory);
+  *mem_ptr =
+      new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op, params,
+                                  num_radix_blocks, allocate_gpu_memory);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -1,12 +1,13 @@
 #include "integer/cmux.cuh"

 void scratch_cuda_integer_radix_cmux_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -17,7 +18,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
      [](uint64_t x) -> uint64_t { return x == 1; };

  scratch_cuda_integer_radix_cmux_kb(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
 }
@@ -25,7 +26,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
 void cuda_cmux_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
-    void *lwe_array_false, int8_t *mem_ptr, void *bsk, void *ksk,
+    void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t lwe_ciphertext_count) {

  host_integer_radix_cmux_kb<uint64_t>(
@@ -34,15 +35,16 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
      static_cast<uint64_t *>(lwe_condition),
      static_cast<uint64_t *>(lwe_array_true),
      static_cast<uint64_t *>(lwe_array_false),
-      (int_cmux_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
+      (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),

      lwe_ciphertext_count);
 }

-void cleanup_cuda_integer_radix_cmux(void *stream, uint32_t gpu_index,
+void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes,
+                                     uint32_t gpu_count,
                                     int8_t **mem_ptr_void) {

  int_cmux_buffer<uint64_t> *mem_ptr =
      (int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -9,8 +9,8 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
                          uint32_t gpu_count, Torus *lwe_array_out,
                          Torus *lwe_array_input, Torus *lwe_condition,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
-                          int_radix_lut<Torus> *predicate, void *bsk,
-                          Torus *ksk, uint32_t num_radix_blocks) {
+                          int_radix_lut<Torus> *predicate, void **bsks,
+                          Torus **ksks, uint32_t num_radix_blocks) {
  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

@@ -36,25 +36,26 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsk,
-      ksk, num_radix_blocks, predicate);
+      streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
+      ksks, num_radix_blocks, predicate);
 }

 template <typename Torus>
 __host__ void host_integer_radix_cmux_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true,
-    Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks) {
+    Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void **bsks,
+    Torus **ksks, uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

  // Since our CPU threads will be working on different streams we shall assert
  // the work in the main stream is completed
-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-  auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
-  auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;
+  auto true_streams = mem_ptr->zero_if_true_buffer->true_streams;
+  auto false_streams = mem_ptr->zero_if_false_buffer->false_streams;
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }

 #pragma omp parallel sections
  {
@@ -62,20 +63,23 @@ __host__ void host_integer_radix_cmux_kb(
 #pragma omp section
    {
      auto mem_true = mem_ptr->zero_if_true_buffer;
-      zero_out_if(&true_stream, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+      zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
                  lwe_array_true, lwe_condition, mem_true,
-                  mem_ptr->inverted_predicate_lut, bsk, ksk, num_radix_blocks);
+                  mem_ptr->inverted_predicate_lut, bsks, ksks,
+                  num_radix_blocks);
    }
 #pragma omp section
    {
      auto mem_false = mem_ptr->zero_if_false_buffer;
-      zero_out_if(&false_stream, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
+      zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
                  lwe_array_false, lwe_condition, mem_false,
-                  mem_ptr->predicate_lut, bsk, ksk, num_radix_blocks);
+                  mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
    }
  }
-  cuda_synchronize_stream(true_stream, gpu_indexes[0]);
-  cuda_synchronize_stream(false_stream, gpu_indexes[0]);
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
+    cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
+  }

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
@@ -86,19 +90,19 @@ __host__ void host_integer_radix_cmux_kb(
                num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsk, ksk,
+      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
      num_radix_blocks, mem_ptr->message_extract_lut);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_cmux_kb(
-    cudaStream_t stream, uint32_t gpu_index, int_cmux_buffer<Torus> **mem_ptr,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_cmux_buffer<Torus> **mem_ptr,
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
-  *mem_ptr =
-      new int_cmux_buffer<Torus>(stream, gpu_index, predicate_lut_f, params,
-                                 num_radix_blocks, allocate_gpu_memory);
+  *mem_ptr = new int_cmux_buffer<Torus>(streams, gpu_indexes, gpu_count,
+                                        predicate_lut_f, params,
+                                        num_radix_blocks, allocate_gpu_memory);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -1,13 +1,13 @@
 #include "integer/comparison.cuh"

 void scratch_cuda_integer_radix_comparison_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -18,7 +18,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  case EQ:
  case NE:
    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-        static_cast<cudaStream_t>(stream), gpu_index,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
        op_type, false, allocate_gpu_memory);
    break;
@@ -29,7 +29,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  case MAX:
  case MIN:
    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-        static_cast<cudaStream_t>(stream), gpu_index,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
        op_type, is_signed, allocate_gpu_memory);
    break;
@@ -39,7 +39,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
 void cuda_comparison_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
-    void *bsk, void *ksk, uint32_t num_radix_blocks) {
+    void **bsks, void **ksks, uint32_t num_radix_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -50,8 +50,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
-        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), num_radix_blocks);
+        static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
+        num_radix_blocks);
    break;
  case GT:
  case GE:
@@ -62,7 +62,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer,
-        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
+        buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
        num_radix_blocks);
    break;
  case MAX:
@@ -71,18 +71,19 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
-        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), num_radix_blocks);
+        static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
+        num_radix_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
  }
 }

-void cleanup_cuda_integer_comparison(void *stream, uint32_t gpu_index,
+void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
+                                     uint32_t gpu_count,
                                     int8_t **mem_ptr_void) {

  int_comparison_buffer<uint64_t> *mem_ptr =
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -56,12 +56,11 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
 *
 */
 template <typename Torus>
-__host__ void
-are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
-                               uint32_t gpu_count, Torus *lwe_array_out,
-                               Torus *lwe_array_in,
-                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                               Torus *ksk, uint32_t num_radix_blocks) {
+__host__ void are_all_comparisons_block_true(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
@@ -94,6 +93,8 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = tmp_out;
    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
+    auto is_equal_to_num_blocks_map =
+        &are_all_block_true_buffer->is_equal_to_lut_map;
    for (int i = 0; i < num_chunks; i++) {
      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
                            input_blocks, big_lwe_dimension, chunk_length);
@@ -103,8 +104,6 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
      input_blocks += (big_lwe_dimension + 1) * chunk_length;
    }
    accumulator = are_all_block_true_buffer->tmp_block_accumulated;
-    auto is_equal_to_num_blocks_map =
-        &are_all_block_true_buffer->is_equal_to_lut_map;

    // Selects a LUT
    int_radix_lut<Torus> *lut;
@@ -119,7 +118,7 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
      } else {
        // LUT needs to be computed
        auto new_lut =
-            new int_radix_lut<Torus>(streams[0], gpu_indexes[0], params,
+            new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                     max_value, num_radix_blocks, true);

        auto is_equal_to_num_blocks_lut_f = [max_value,
@@ -127,10 +126,12 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
          return (x & max_value) == chunk_length;
        };
        generate_device_accumulator<Torus>(
-            streams[0], gpu_indexes[0], new_lut->lut, glwe_dimension,
-            polynomial_size, message_modulus, carry_modulus,
+            streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
+            glwe_dimension, polynomial_size, message_modulus, carry_modulus,
            is_equal_to_num_blocks_lut_f);

+        new_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+
        (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
        lut = new_lut;
      }
@@ -140,12 +141,12 @@ are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk,
-          1, lut);
+          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
+          ksks, 1, lut);
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsk, ksk,
+          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
          num_chunks, lut);
    }
  }
@@ -161,7 +162,7 @@ template <typename Torus>
 __host__ void is_at_least_one_comparisons_block_true(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {

  cudaSetDevice(gpu_indexes[0]);
@@ -207,13 +208,13 @@ __host__ void is_at_least_one_comparisons_block_true(
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk,
-          1, lut);
+          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
+          ksks, 1, lut);
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
-          accumulator, bsk, ksk, num_chunks, lut);
+          accumulator, bsks, ksks, num_chunks, lut);
    }
  }
 }
@@ -241,7 +242,7 @@ template <typename Torus>
 __host__ void host_compare_with_zero_equality(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {

  cudaSetDevice(gpu_indexes[0]);
@@ -293,27 +294,26 @@ __host__ void host_compare_with_zero_equality(
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, sum, sum, bsk, ksk, num_sum_blocks,
+      streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
      zero_comparison);
  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                 sum, mem_ptr, bsk, ksk, num_sum_blocks);
+                                 sum, mem_ptr, bsks, ksks, num_sum_blocks);
 }

 template <typename Torus>
 __host__ void host_integer_radix_equality_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto eq_buffer = mem_ptr->eq_buffer;

  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
  integer_radix_apply_bivariate_lookup_table_kb(
      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
-      bsk, ksk, num_radix_blocks, eq_buffer->operator_lut,
+      bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
      eq_buffer->operator_lut->params.message_modulus);

  // This takes a Vec of blocks, where each block is either 0 or 1.
@@ -321,7 +321,7 @@ __host__ void host_integer_radix_equality_check_kb(
  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
-                                 comparisons, mem_ptr, bsk, ksk,
+                                 comparisons, mem_ptr, bsks, ksks,
                                 num_radix_blocks);
 }

@@ -330,10 +330,9 @@ __host__ void
 compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *lwe_array_out,
                        Torus *lwe_array_left, Torus *lwe_array_right,
-                        int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                        Torus *ksk, uint32_t num_radix_blocks) {
+                        int_comparison_buffer<Torus> *mem_ptr, void **bsks,
+                        Torus **ksks, uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -360,7 +359,7 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb(
-      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsk, ksk,
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
      num_radix_blocks, is_non_zero_lut);

  // Add one
@@ -380,10 +379,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
                    uint32_t gpu_count, Torus *lwe_array_out,
                    Torus *lwe_block_comparisons,
                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
-                    std::function<Torus(Torus)> sign_handler_f, void *bsk,
-                    Torus *ksk, uint32_t num_radix_blocks) {
+                    std::function<Torus(Torus)> sign_handler_f, void **bsks,
+                    Torus **ksks, uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = tree_buffer->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -413,7 +411,7 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
                partial_block_count, 4);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, x, y, bsk, ksk,
+        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
        partial_block_count >> 1, inner_tree_leaf);

    if ((partial_block_count % 2) != 0) {
@@ -451,13 +449,15 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
    y = x;
    f = sign_handler_f;
  }
-  generate_device_accumulator<Torus>(streams[0], gpu_indexes[0], last_lut->lut,
-                                     glwe_dimension, polynomial_size,
-                                     message_modulus, carry_modulus, f);
+  generate_device_accumulator<Torus>(
+      streams[0], gpu_indexes[0], last_lut->get_lut(gpu_indexes[0], 0),
+      glwe_dimension, polynomial_size, message_modulus, carry_modulus, f);
+  last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

  // Last leaf
-  integer_radix_apply_univariate_lookup_table_kb(
-      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsk, ksk, 1, last_lut);
+  integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
+                                                 gpu_count, lwe_array_out, y,
+                                                 bsks, ksks, 1, last_lut);
 }

 template <typename Torus>
@@ -465,10 +465,9 @@ __host__ void host_integer_radix_difference_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right,
    int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
+    std::function<Torus(Torus)> reduction_lut_f, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
@@ -500,10 +499,10 @@ __host__ void host_integer_radix_difference_check_kb(
    // Clean noise
    auto identity_lut = mem_ptr->identity_lut;
    integer_radix_apply_univariate_lookup_table_kb(
-        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsk, ksk,
+        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
        packed_num_radix_blocks, identity_lut);
    integer_radix_apply_univariate_lookup_table_kb(
-        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsk, ksk,
+        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
        packed_num_radix_blocks, identity_lut);

    lhs = packed_left;
@@ -520,14 +519,15 @@ __host__ void host_integer_radix_difference_check_kb(
    // Compare packed blocks, or simply the total number of radix blocks in the
    // inputs
    compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
-                            rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks);
+                            rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
    num_comparisons = packed_num_radix_blocks;
  } else {
    // Packing is possible
    if (carry_modulus >= message_modulus) {
      // Compare (num_radix_blocks - 2) / 2 packed blocks
      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
-                              rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks);
+                              rhs, mem_ptr, bsks, ksks,
+                              packed_num_radix_blocks);

      // Compare the last block before the sign block separately
      auto identity_lut = mem_ptr->identity_lut;
@@ -538,37 +538,37 @@ __host__ void host_integer_radix_difference_check_kb(
          packed_num_radix_blocks * big_lwe_size;
      integer_radix_apply_univariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
-          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
+          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
          identity_lut);
      integer_radix_apply_univariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
-          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
-          identity_lut);
+          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
+          1, identity_lut);
      compare_radix_blocks_kb(
          streams, gpu_indexes, gpu_count,
          comparisons + packed_num_radix_blocks * big_lwe_size,
          last_left_block_before_sign_block, last_right_block_before_sign_block,
-          mem_ptr, bsk, ksk, 1);
+          mem_ptr, bsks, ksks, 1);
      // Compare the sign block separately
      integer_radix_apply_bivariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count,
          comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
-          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
-          mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
+          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsks, ksks,
+          1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
      num_comparisons = packed_num_radix_blocks + 2;

    } else {
      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
-                              lwe_array_left, lwe_array_right, mem_ptr, bsk,
-                              ksk, num_radix_blocks - 1);
+                              lwe_array_left, lwe_array_right, mem_ptr, bsks,
+                              ksks, num_radix_blocks - 1);
      // Compare the sign block separately
      integer_radix_apply_bivariate_lookup_table_kb(
          streams, gpu_indexes, gpu_count,
          comparisons + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
-          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
-          mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
+          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsks, ksks,
+          1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
      num_comparisons = num_radix_blocks;
    }
  }
@@ -578,20 +578,19 @@ __host__ void host_integer_radix_difference_check_kb(
  // final sign
  tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
                      comparisons, mem_ptr->diff_buffer->tree_buffer,
-                      reduction_lut_f, bsk, ksk, num_comparisons);
+                      reduction_lut_f, bsks, ksks, num_comparisons);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_comparison_check_kb(
-    cudaStream_t stream, uint32_t gpu_index,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_comparison_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, COMPARISON_TYPE op, bool is_signed,
    bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
-  *mem_ptr = new int_comparison_buffer<Torus>(stream, gpu_index, op, params,
-                                              num_radix_blocks, is_signed,
-                                              allocate_gpu_memory);
+  *mem_ptr = new int_comparison_buffer<Torus>(streams, gpu_indexes, gpu_count,
+                                              op, params, num_radix_blocks,
+                                              is_signed, allocate_gpu_memory);
 }

 template <typename Torus>
@@ -599,20 +598,19 @@ __host__ void
 host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                             uint32_t gpu_count, Torus *lwe_array_out,
                             Torus *lwe_array_left, Torus *lwe_array_right,
-                             int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                             Torus *ksk, uint32_t total_num_radix_blocks) {
+                             int_comparison_buffer<Torus> *mem_ptr, void **bsks,
+                             Torus **ksks, uint32_t total_num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  // Compute the sign
  host_integer_radix_difference_check_kb(
      streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
-      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsk,
-      ksk, total_num_radix_blocks);
+      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
+      ksks, total_num_radix_blocks);

  // Selector
  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
+                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
                             total_num_radix_blocks);
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -1,12 +1,12 @@
 #include "integer/div_rem.cuh"

 void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -14,15 +14,15 @@ void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_div_rem_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }

 void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
-    void *remainder, void *numerator, void *divisor, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t num_blocks) {
+    void *remainder, void *numerator, void *divisor, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t num_blocks) {

  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

@@ -32,7 +32,7 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
    break;
  case 1024:

@@ -40,35 +40,35 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
    break;
  case 2048:
    host_integer_div_rem_kb<uint64_t, Degree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
    break;
  case 4096:
    host_integer_div_rem_kb<uint64_t, Degree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
    break;
  case 8192:
    host_integer_div_rem_kb<uint64_t, Degree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
    break;
  case 16384:
    host_integer_div_rem_kb<uint64_t, Degree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
-        bsk, static_cast<uint64_t *>(ksk), mem, num_blocks);
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
    break;
  default:
    PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
@@ -76,10 +76,10 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
  }
 }

-void cleanup_cuda_integer_div_rem(void *stream, uint32_t gpu_index,
-                                  int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
+                                  uint32_t gpu_count, int8_t **mem_ptr_void) {
  int_div_rem_memory<uint64_t> *mem_ptr =
      (int_div_rem_memory<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -164,22 +164,21 @@ template <typename Torus> struct lwe_ciphertext_list {
 };

 template <typename Torus>
-__host__ void
-scratch_cuda_integer_div_rem_kb(cudaStream_t stream, uint32_t gpu_index,
-                                int_div_rem_memory<Torus> **mem_ptr,
-                                uint32_t num_blocks, int_radix_params params,
-                                bool allocate_gpu_memory) {
+__host__ void scratch_cuda_integer_div_rem_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_div_rem_memory<Torus> **mem_ptr, uint32_t num_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

-  *mem_ptr = new int_div_rem_memory<Torus>(stream, gpu_index, params,
-                                           num_blocks, allocate_gpu_memory);
+  *mem_ptr = new int_div_rem_memory<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
 }

 template <typename Torus, class params>
 __host__ void
 host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *quotient, Torus *remainder,
-                        Torus *numerator, Torus *divisor, void *bsk,
-                        uint64_t *ksk, int_div_rem_memory<uint64_t> *mem_ptr,
+                        Torus *numerator, Torus *divisor, void **bsks,
+                        uint64_t **ksks, int_div_rem_memory<uint64_t> *mem_ptr,
                        uint32_t num_blocks) {

  auto radix_params = mem_ptr->params;
@@ -290,7 +289,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

          integer_radix_apply_univariate_lookup_table_kb(
              streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
-              interesting_divisor.last_block(), bsk, ksk, 1,
+              interesting_divisor.last_block(), bsks, ksks, 1,
              mem_ptr->masking_luts_1[shifted_mask]);
        }; // trim_last_interesting_divisor_bits

@@ -318,7 +317,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

          integer_radix_apply_univariate_lookup_table_kb(
              streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
-              divisor_ms_blocks.first_block(), bsk, ksk, 1,
+              divisor_ms_blocks.first_block(), bsks, ksks, 1,
              mem_ptr->masking_luts_2[shifted_mask]);
        }; // trim_first_divisor_ms_bits

@@ -342,7 +341,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,

          host_integer_radix_logical_scalar_shift_kb_inplace(
              streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
-              mem_ptr->shift_mem_1, bsk, ksk, interesting_remainder1.len);
+              mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);

          tmp_radix.clone_from(interesting_remainder1, 0,
                               interesting_remainder1.len - 1, streams[0],
@@ -371,41 +370,46 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          host_integer_radix_logical_scalar_shift_kb_inplace(
              streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
-              mem_ptr->shift_mem_2, bsk, ksk, interesting_remainder2.len);
+              mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
        }; // left_shift_interesting_remainder2

-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
 #pragma omp parallel sections
    {
 #pragma omp section
      {
        // interesting_divisor
-        trim_last_interesting_divisor_bits(&mem_ptr->sub_stream_1,
-                                           &gpu_indexes[0], 1);
+        trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
+                                           gpu_count);
      }
 #pragma omp section
      {
        // divisor_ms_blocks
-        trim_first_divisor_ms_bits(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1);
+        trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes,
+                                   gpu_count);
      }
 #pragma omp section
      {
        // interesting_remainder1
        // numerator_block_stack
-        left_shift_interesting_remainder1(&mem_ptr->sub_stream_3,
-                                          &gpu_indexes[0], 1);
+        left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
+                                          gpu_count);
      }
 #pragma omp section
      {
        // interesting_remainder2
-        left_shift_interesting_remainder2(&mem_ptr->sub_stream_4,
-                                          &gpu_indexes[0], 1);
+        left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
+                                          gpu_count);
      }
    }
-    cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
-    cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
-    cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
-    cuda_synchronize_stream(mem_ptr->sub_stream_4, gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
+    }

    // if interesting_remainder1 != 0 -> interesting_remainder2 == 0
    // if interesting_remainder1 == 0 -> interesting_remainder2 != 0
@@ -438,7 +442,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
      host_integer_overflowing_sub_kb<Torus, params>(
          streams, gpu_indexes, gpu_count, new_remainder.data,
          subtraction_overflowed.data, merged_interesting_remainder.data,
-          interesting_divisor.data, bsk, ksk, mem_ptr->overflow_sub_mem,
+          interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
          merged_interesting_remainder.len);
    };

@@ -458,7 +462,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
        // So we can skip some stuff
        host_compare_with_zero_equality(
            streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
-            mem_ptr->comparison_buffer, bsk, ksk, trivial_blocks.len,
+            mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);

        tmp_1.len =
@@ -467,7 +471,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
        is_at_least_one_comparisons_block_true(
            streams, gpu_indexes, gpu_count,
            at_least_one_upper_block_is_non_zero.data, tmp_1.data,
-            mem_ptr->comparison_buffer, bsk, ksk, tmp_1.len);
+            mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
      }
    };

@@ -480,36 +484,41 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
          integer_radix_apply_univariate_lookup_table_kb(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
-              cleaned_merged_interesting_remainder.data, bsk, ksk,
+              cleaned_merged_interesting_remainder.data, bsks, ksks,
              cleaned_merged_interesting_remainder.len,
              mem_ptr->message_extract_lut_1);
        };

    // phase 2
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
 #pragma omp parallel sections
    {
 #pragma omp section
      {
        // new_remainder
        // subtraction_overflowed
-        do_overflowing_sub(&mem_ptr->sub_stream_1, &gpu_indexes[0], 1);
+        do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
      }
 #pragma omp section
      {
        // at_least_one_upper_block_is_non_zero
-        check_divisor_upper_blocks(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1);
+        check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes,
+                                   gpu_count);
      }
 #pragma omp section
      {
        // cleaned_merged_interesting_remainder
-        create_clean_version_of_merged_remainder(&mem_ptr->sub_stream_3,
-                                                 &gpu_indexes[0], 1);
+        create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
+                                                 gpu_indexes, gpu_count);
      }
    }
-    cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
-    cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
-    cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+    }

    host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
                  subtraction_overflowed.data,
@@ -528,7 +537,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data,
-              overflow_sum_radix.data, bsk, ksk,
+              overflow_sum_radix.data, bsks, ksks,
              cleaned_merged_interesting_remainder.len,
              mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
              factor);
@@ -538,7 +547,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count, new_remainder.data,
-              new_remainder.data, overflow_sum_radix.data, bsk, ksk,
+              new_remainder.data, overflow_sum_radix.data, bsks, ksks,
              new_remainder.len,
              mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
        };
@@ -548,7 +557,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, did_not_overflow.data,
          subtraction_overflowed.data,
-          at_least_one_upper_block_is_non_zero.data, bsk, ksk, 1,
+          at_least_one_upper_block_is_non_zero.data, bsks, ksks, 1,
          mem_ptr->merge_overflow_flags_luts[pos_in_block],
          mem_ptr->merge_overflow_flags_luts[pos_in_block]
              ->params.message_modulus);
@@ -559,30 +568,34 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                    did_not_overflow.data, radix_params.big_lwe_dimension, 1);
    };

-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
 #pragma omp parallel sections
    {
 #pragma omp section
      {
        // cleaned_merged_interesting_remainder
        conditionally_zero_out_merged_interesting_remainder(
-            &mem_ptr->sub_stream_1, &gpu_indexes[0], 1);
+            mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
      }
 #pragma omp section
      {
        // new_remainder
-        conditionally_zero_out_merged_new_remainder(&mem_ptr->sub_stream_2,
-                                                    &gpu_indexes[0], 1);
+        conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
+                                                    gpu_indexes, gpu_count);
      }
 #pragma omp section
      {
        // quotient
-        set_quotient_bit(&mem_ptr->sub_stream_3, &gpu_indexes[0], 1);
+        set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
      }
    }
-    cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
-    cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
-    cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+    }

    assert(first_trivial_block - 1 == cleaned_merged_interesting_remainder.len);
    assert(first_trivial_block - 1 == new_remainder.len);
@@ -601,24 +614,28 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                remainder2.data, radix_params.big_lwe_dimension,
                remainder1.len);

-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }
 #pragma omp parallel sections
  {
 #pragma omp section
    {
      integer_radix_apply_univariate_lookup_table_kb(
-          &mem_ptr->sub_stream_1, &gpu_indexes[0], 1, remainder, remainder, bsk,
-          ksk, num_blocks, mem_ptr->message_extract_lut_1);
+          mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
+          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
    }
 #pragma omp section
    {
      integer_radix_apply_univariate_lookup_table_kb(
-          &mem_ptr->sub_stream_2, &gpu_indexes[0], 1, quotient, quotient, bsk,
-          ksk, num_blocks, mem_ptr->message_extract_lut_2);
+          mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
+          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
    }
  }
-  cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
-  cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+    cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+  }
 }

 #endif // TFHE_RS_DIV_REM_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -1,127 +1,54 @@
 #include "integer/integer.cuh"
 #include <linear_algebra.h>

-void cuda_full_propagation_64_inplace(
-    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *input_blocks, int8_t *mem_ptr, void *ksk, void *bsk,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_base_log, uint32_t ks_level, uint32_t pbs_base_log,
-    uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_blocks) {
+void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
+                                      uint32_t gpu_count, void *input_blocks,
+                                      int8_t *mem_ptr, void **ksks, void **bsks,
+                                      uint32_t num_blocks) {

-  switch (polynomial_size) {
-  case 256:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 512:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 1024:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 2048:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 4096:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 8192:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 16384:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
-        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-        static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  default:
-    PANIC("Cuda error (full propagation inplace): unsupported polynomial size. "
-          "Supported N's are powers of two"
-          " in the interval [256..16384].")
-  }
+  int_fullprop_buffer<uint64_t> *buffer =
+      (int_fullprop_buffer<uint64_t> *)mem_ptr;
+
+  host_full_propagate_inplace<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(input_blocks), buffer, (uint64_t **)(ksks), bsks,
+      num_blocks);
 }

 void scratch_cuda_full_propagation_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus);

  scratch_cuda_full_propagation<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
-      (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension, glwe_dimension,
-      polynomial_size, level_count, grouping_factor, input_lwe_ciphertext_count,
-      message_modulus, carry_modulus, pbs_type, allocate_gpu_memory);
+      (cudaStream_t *)streams, gpu_indexes, gpu_count,
+      (int_fullprop_buffer<uint64_t> **)mem_ptr, params, num_radix_blocks,
+      allocate_gpu_memory);
 }

-void cleanup_cuda_full_propagation(void *stream, uint32_t gpu_index,
-                                   int8_t **mem_ptr_void) {
+void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
+                                   uint32_t gpu_count, int8_t **mem_ptr_void) {

  int_fullprop_buffer<uint64_t> *mem_ptr =
      (int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
-  auto s = static_cast<cudaStream_t>(stream);

-  cuda_drop_async(mem_ptr->lut_buffer, s, gpu_index);
-  cuda_drop_async(mem_ptr->lut_indexes, s, gpu_index);
-
-  cuda_drop_async(mem_ptr->lwe_indexes, s, gpu_index);
-
-  cuda_drop_async(mem_ptr->tmp_small_lwe_vector, s, gpu_index);
-  cuda_drop_async(mem_ptr->tmp_big_lwe_vector, s, gpu_index);
-
-  switch (mem_ptr->pbs_type) {
-  case CLASSICAL: {
-    auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(mem_ptr->pbs_buffer);
-    x->release(s, gpu_index);
-  } break;
-  case MULTI_BIT: {
-    auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(mem_ptr->pbs_buffer);
-    x->release(s, gpu_index);
-  } break;
-  default:
-    PANIC("Cuda error (PBS): unsupported implementation variant.")
-  }
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

 void scratch_cuda_propagate_single_carry_kb_64_inplace(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -129,35 +56,49 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
                          message_modulus, carry_modulus);

  scratch_cuda_propagate_single_carry_kb_inplace(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }

 void cuda_propagate_single_carry_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
+    void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t num_blocks) {
  host_propagate_single_carry<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array),
-      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
+      nullptr, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_propagate_single_carry(void *stream, uint32_t gpu_index,
+void cuda_propagate_single_carry_get_input_carries_kb_64_inplace(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    void *carry_out, void *input_carries, int8_t *mem_ptr, void **bsks,
+    void **ksks, uint32_t num_blocks) {
+  host_propagate_single_carry<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
+      static_cast<uint64_t *>(input_carries),
+      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks);
+}
+
+void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
+                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void) {
  int_sc_prop_memory<uint64_t> *mem_ptr =
      (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

 void scratch_cuda_apply_univariate_lut_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, void *input_lut,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -165,7 +106,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
                          grouping_factor, message_modulus, carry_modulus);

  scratch_cuda_apply_univariate_lut_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
      num_radix_blocks, params, allocate_gpu_memory);
 }
@@ -173,19 +114,64 @@ void scratch_cuda_apply_univariate_lut_kb_64(
 void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
                                     uint32_t gpu_count, void *output_radix_lwe,
                                     void *input_radix_lwe, int8_t *mem_ptr,
-                                     void *ksk, void *bsk,
+                                     void **ksks, void **bsks,
                                     uint32_t num_blocks) {

  host_apply_univariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(output_radix_lwe),
      static_cast<uint64_t *>(input_radix_lwe),
-      (int_radix_lut<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk), bsk,
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
      num_blocks);
 }

-void cleanup_cuda_apply_univariate_lut_kb_64(void *stream, uint32_t gpu_index,
+void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
+                                             uint32_t *gpu_indexes,
+                                             uint32_t gpu_count,
                                             int8_t **mem_ptr_void) {
  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
+
+void scratch_cuda_apply_bivariate_lut_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus);
+
+  scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
+      num_radix_blocks, params, allocate_gpu_memory);
+}
+
+void cuda_apply_bivariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
+                                    uint32_t gpu_count, void *output_radix_lwe,
+                                    void *input_radix_lwe_1,
+                                    void *input_radix_lwe_2, int8_t *mem_ptr,
+                                    void **ksks, void **bsks,
+                                    uint32_t num_blocks, uint32_t shift) {
+
+  host_apply_bivariate_lut_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<uint64_t *>(input_radix_lwe_1),
+      static_cast<uint64_t *>(input_radix_lwe_2),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
+      shift);
+}
+
+void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
+                                            uint32_t *gpu_indexes,
+                                            uint32_t gpu_count,
+                                            int8_t **mem_ptr_void) {
+  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -72,7 +72,7 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
          "pointers should be different");
  }
  cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_right<<<blocks_count, 256, 0, streams[0]>>>(
+  radix_blocks_rotate_right<<<blocks_count, 1024, 0, streams[0]>>>(
      dst, src, value, blocks_count, lwe_size);
 }

@@ -89,7 +89,7 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
          "pointers should be different");
  }
  cudaSetDevice(gpu_indexes[0]);
-  radix_blocks_rotate_left<<<blocks_count, 256, 0, streams[0]>>>(
+  radix_blocks_rotate_left<<<blocks_count, 1024, 0, streams[0]>>>(
      dst, src, value, blocks_count, lwe_size);
 }

@@ -138,7 +138,7 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,
 template <typename Torus>
 __host__ void integer_radix_apply_univariate_lookup_table_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_in, void *bsk, Torus *ksk,
+    Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
  // apply_lookup_table
  auto params = lut->params;
@@ -153,30 +153,38 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

-  // Compute Keyswitch-PBS
-  cuda_keyswitch_lwe_ciphertext_vector(
-      streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks,
-      lut->lwe_trivial_indexes, lwe_array_in, lut->lwe_indexes_in, ksk,
-      big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
-      num_radix_blocks);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  /// Apply KS to go from a big LWE dimension to a small LWE dimension
+  execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
+                           lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
+                           lwe_array_in, lut->lwe_indexes_in, ksks,
+                           big_lwe_dimension, small_lwe_dimension, ks_base_log,
+                           ks_level, num_radix_blocks, false);

-  execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                     lut->lwe_indexes_out, lut->lut, lut->lut_indexes,
-                     lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk,
-                     lut->buffer, glwe_dimension, small_lwe_dimension,
-                     polynomial_size, pbs_base_log, pbs_level, grouping_factor,
-                     num_radix_blocks, 1, 0,
-                     cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
+  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+  /// dimension to a big LWE dimension
+  execute_pbs<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
+      lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
+      lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
+      small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+      grouping_factor, num_radix_blocks, 1, 0,
+      cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+
+  /// Synchronize all GPUs
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  for (uint i = 0; i < active_gpu_count; i++) {
+    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+  }
 }

 template <typename Torus>
 __host__ void integer_radix_apply_bivariate_lookup_table_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut,
+    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void **bsks,
+    Torus **ksks, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut,
    uint32_t shift) {
-  cudaSetDevice(gpu_indexes[0]);
-  // apply_lookup_table_bivariate
+
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -188,7 +196,6 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;
-  auto message_modulus = params.message_modulus;

  // Left message is shifted
  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
@@ -198,20 +205,30 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
                        num_radix_blocks);
  check_cuda_error(cudaGetLastError());

-  // Apply LUT
-  cuda_keyswitch_lwe_ciphertext_vector(
-      streams[0], gpu_indexes[0], lut->tmp_lwe_after_ks,
-      lut->lwe_trivial_indexes, lwe_array_pbs_in, lut->lwe_trivial_indexes, ksk,
-      big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
-      num_radix_blocks);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);

-  execute_pbs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
-                     lut->lwe_indexes_out, lut->lut, lut->lut_indexes,
-                     lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, bsk,
-                     lut->buffer, glwe_dimension, small_lwe_dimension,
-                     polynomial_size, pbs_base_log, pbs_level, grouping_factor,
-                     num_radix_blocks, 1, 0,
-                     cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
+  /// Apply KS to go from a big LWE dimension to a small LWE dimension
+  execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
+                           lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
+                           lwe_array_pbs_in, lut->lwe_indexes_in, ksks,
+                           big_lwe_dimension, small_lwe_dimension, ks_base_log,
+                           ks_level, num_radix_blocks, false);
+
+  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+  /// dimension to a big LWE dimension
+  execute_pbs<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
+      lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
+      lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
+      small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+      grouping_factor, num_radix_blocks, 1, 0,
+      cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+
+  /// Synchronize all GPUs
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  for (uint i = 0; i < active_gpu_count; i++) {
+    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+  }
 }

 // Rotates the slice in-place such that the first mid elements of the slice move
@@ -317,7 +334,7 @@ void generate_device_accumulator_bivariate(
  generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
                                         message_modulus, carry_modulus, f);

-  // copy host lut and lut_indexes to device
+  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                           (glwe_dimension + 1) * polynomial_size *
                               sizeof(Torus),
@@ -351,7 +368,7 @@ void generate_device_accumulator_bivariate_with_factor(
      h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
      factor);

-  // copy host lut and lut_indexes to device
+  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                           (glwe_dimension + 1) * polynomial_size *
                               sizeof(Torus),
@@ -386,7 +403,7 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, f);

-  // copy host lut and lut_indexes to device
+  // copy host lut and lut_indexes_vec to device
  cuda_memcpy_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream, gpu_index);
@@ -398,20 +415,21 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,

 template <typename Torus>
 void scratch_cuda_propagate_single_carry_kb_inplace(
-    cudaStream_t stream, uint32_t gpu_index,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_sc_prop_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
-  *mem_ptr = new int_sc_prop_memory<Torus>(
-      stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory);
+  *mem_ptr =
+      new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                    num_radix_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
 void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 uint32_t gpu_count, Torus *lwe_array,
-                                 int_sc_prop_memory<Torus> *mem, void *bsk,
-                                 Torus *ksk, uint32_t num_blocks) {
+                                 Torus *carry_out, Torus *input_carries,
+                                 int_sc_prop_memory<Torus> *mem, void **bsks,
+                                 Torus **ksks, uint32_t num_blocks) {
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -426,14 +444,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
  auto message_acc = mem->message_acc;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsk,
-      ksk, num_blocks, luts_array);
+      streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks,
+      ksks, num_blocks, luts_array);

  // compute prefix sum with hillis&steele

  int num_steps = ceil(log2((double)num_blocks));
  int space = 1;
-  cudaSetDevice(gpu_indexes[0]);
  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
                               big_lwe_size_bytes * num_blocks, streams[0],
                               gpu_indexes[0]);
@@ -445,28 +462,37 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
-        bsk, ksk, cur_total_blocks, luts_carry_propagation_sum,
+        bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
        luts_carry_propagation_sum->params.message_modulus);

-    cudaSetDevice(gpu_indexes[0]);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    cuda_memcpy_async_gpu_to_gpu(
        &generates_or_propagates[space * big_lwe_size], cur_blocks,
        big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
    space *= 2;
  }

-  cudaSetDevice(gpu_indexes[0]);
  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
                                 generates_or_propagates, 1, num_blocks,
                                 big_lwe_size);
+  if (carry_out != nullptr) {
+    cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes,
+                                 streams[0], gpu_indexes[0]);
+  }
  cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
                    gpu_indexes[0]);

+  if (input_carries != nullptr) {
+    cuda_memcpy_async_gpu_to_gpu(input_carries, step_output,
+                                 big_lwe_size_bytes * num_blocks, streams[0],
+                                 gpu_indexes[0]);
+  }
+
  host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
                glwe_dimension * polynomial_size, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsk, ksk,
+      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
      num_blocks, message_acc);
 }

@@ -475,9 +501,8 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
                                      uint32_t *gpu_indexes, uint32_t gpu_count,
                                      Torus *overflowed, Torus *lwe_array,
                                      int_single_borrow_prop_memory<Torus> *mem,
-                                      void *bsk, Torus *ksk,
+                                      void **bsks, Torus **ksks,
                                      uint32_t num_blocks) {
-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -492,8 +517,8 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
  auto message_acc = mem->message_acc;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsk,
-      ksk, num_blocks, luts_array);
+      streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks,
+      ksks, num_blocks, luts_array);

  // compute prefix sum with hillis&steele
  int num_steps = ceil(log2((double)num_blocks));
@@ -509,7 +534,7 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
-        bsk, ksk, cur_total_blocks, luts_carry_propagation_sum,
+        bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
        luts_carry_propagation_sum->params.message_modulus);

    cuda_memcpy_async_gpu_to_gpu(
@@ -532,41 +557,39 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
                   step_output, glwe_dimension * polynomial_size, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsk, ksk,
+      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
      num_blocks, message_acc);
 }

 /*
 * input_blocks: input radix ciphertext propagation will happen inplace
 * acc_message_carry: list of two lut s, [(message_acc), (carry_acc)]
- * lut_indexes_message_carry: lut_indexes for message and carry, should always
- * be  {0, 1} small_lwe_vector: output of keyswitch should have size = 2 *
- * (lwe_dimension + 1) * sizeof(Torus) big_lwe_vector: output of pbs should have
- *     size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
+ * lut_indexes_message_carry: lut_indexes_vec for message and carry, should
+ * always be {0, 1}. small_lwe_vector: output of keyswitch, should have size =
+ * 2 * (lwe_dimension + 1) * sizeof(Torus). big_lwe_vector: output of pbs,
+ * should have size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
 */
-template <typename Torus, typename STorus, class params>
+template <typename Torus>
 void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
                                 uint32_t gpu_count, Torus *input_blocks,
                                 int_fullprop_buffer<Torus> *mem_ptr,
-                                 Torus *ksk, void *bsk, uint32_t lwe_dimension,
-                                 uint32_t glwe_dimension,
-                                 uint32_t polynomial_size, uint32_t ks_base_log,
-                                 uint32_t ks_level, uint32_t pbs_base_log,
-                                 uint32_t pbs_level, uint32_t grouping_factor,
+                                 Torus **ksks, void **bsks,
                                 uint32_t num_blocks) {
+  auto params = mem_ptr->lut->params;

-  cudaSetDevice(gpu_indexes[0]);
-  int big_lwe_size = (glwe_dimension * polynomial_size + 1);
-  int small_lwe_size = (lwe_dimension + 1);
+  int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
+  int small_lwe_size = (params.small_lwe_dimension + 1);

  for (int i = 0; i < num_blocks; i++) {
    auto cur_input_block = &input_blocks[i * big_lwe_size];

+    cudaSetDevice(gpu_indexes[0]);
+    /// Since the keyswitch is done on one input only, use only 1 GPU
    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
        streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector,
-        mem_ptr->lwe_indexes, cur_input_block, mem_ptr->lwe_indexes, ksk,
-        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
-        1);
+        mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
+        mem_ptr->lut->lwe_trivial_indexes, ksks[0], params.big_lwe_dimension,
+        params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);

    cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
                                 mem_ptr->tmp_small_lwe_vector,
@@ -575,11 +598,13 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,

    execute_pbs<Torus>(
        streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector,
-        mem_ptr->lwe_indexes, mem_ptr->lut_buffer, mem_ptr->lut_indexes,
-        mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk,
-        mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
-        pbs_base_log, pbs_level, grouping_factor, 2, 2, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), mem_ptr->pbs_type);
+        mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
+        mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector,
+        mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
+        params.glwe_dimension, params.small_lwe_dimension,
+        params.polynomial_size, params.pbs_base_log, params.pbs_level,
+        params.grouping_factor, 2, 2, 0,
+        cuda_get_max_shared_memory(gpu_indexes[0]), params.pbs_type);

    cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
                                 big_lwe_size * sizeof(Torus), streams[0],
@@ -590,108 +615,22 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
      host_addition(streams[0], gpu_indexes[0], next_input_block,
                    next_input_block,
                    &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
-                    glwe_dimension * polynomial_size, 1);
+                    params.big_lwe_dimension, 1);
    }
  }
 }

 template <typename Torus>
-void scratch_cuda_full_propagation(
-    cudaStream_t stream, uint32_t gpu_index,
-    int_fullprop_buffer<Torus> **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t pbs_level,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
+                                   uint32_t gpu_count,
+                                   int_fullprop_buffer<Torus> **mem_ptr,
+                                   int_radix_params params,
+                                   uint32_t num_radix_blocks,
+                                   bool allocate_gpu_memory) {

-  int8_t *pbs_buffer;
-  execute_scratch_pbs<Torus>(
-      stream, gpu_index, &pbs_buffer, glwe_dimension, lwe_dimension,
-      polynomial_size, pbs_level, grouping_factor, num_radix_blocks,
-      cuda_get_max_shared_memory(gpu_index), pbs_type, allocate_gpu_memory);
-
-  // LUT
-  Torus *lut_buffer;
-  if (allocate_gpu_memory) {
-    // LUT is used as a trivial encryption, so we only allocate memory for the
-    // body
-    Torus lut_buffer_size =
-        2 * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);
-
-    lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream, gpu_index);
-
-    // LUTs
-    auto lut_f_message = [message_modulus](Torus x) -> Torus {
-      return x % message_modulus;
-    };
-    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
-      return x / message_modulus;
-    };
-
-    //
-    Torus *lut_buffer_message = lut_buffer;
-    Torus *lut_buffer_carry =
-        lut_buffer + (glwe_dimension + 1) * polynomial_size;
-
-    generate_device_accumulator<Torus>(
-        stream, gpu_index, lut_buffer_message, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, lut_f_message);
-
-    generate_device_accumulator<Torus>(
-        stream, gpu_index, lut_buffer_carry, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, lut_f_carry);
-  }
-
-  Torus *lut_indexes;
-  if (allocate_gpu_memory) {
-    lut_indexes =
-        (Torus *)cuda_malloc_async(2 * sizeof(Torus), stream, gpu_index);
-
-    Torus h_lut_indexes[2] = {0, 1};
-    cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, 2 * sizeof(Torus),
-                             stream, gpu_index);
-  }
-
-  Torus *lwe_indexes;
-  if (allocate_gpu_memory) {
-    Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
-
-    lwe_indexes =
-        (Torus *)cuda_malloc_async(lwe_indexes_size, stream, gpu_index);
-    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
-    for (int i = 0; i < num_radix_blocks; i++)
-      h_lwe_indexes[i] = i;
-    cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
-                             stream, gpu_index);
-    cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
-                             h_lwe_indexes);
-  }
-
-  // Temporary arrays
-  Torus *small_lwe_vector;
-  Torus *big_lwe_vector;
-  if (allocate_gpu_memory) {
-    Torus small_vector_size = 2 * (lwe_dimension + 1) * sizeof(Torus);
-    Torus big_vector_size =
-        2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus);
-
-    small_lwe_vector =
-        (Torus *)cuda_malloc_async(small_vector_size, stream, gpu_index);
-    big_lwe_vector =
-        (Torus *)cuda_malloc_async(big_vector_size, stream, gpu_index);
-  }
-
-  *mem_ptr = new int_fullprop_buffer<Torus>;
-
-  (*mem_ptr)->pbs_type = pbs_type;
-  (*mem_ptr)->pbs_buffer = pbs_buffer;
-
-  (*mem_ptr)->lut_buffer = lut_buffer;
-  (*mem_ptr)->lut_indexes = lut_indexes;
-  (*mem_ptr)->lwe_indexes = lwe_indexes;
-
-  (*mem_ptr)->tmp_small_lwe_vector = small_lwe_vector;
-  (*mem_ptr)->tmp_big_lwe_vector = big_lwe_vector;
+  *mem_ptr =
+      new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
+                                     num_radix_blocks, allocate_gpu_memory);
 }

 // (lwe_dimension+1) threads
@@ -740,7 +679,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,

  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
  device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
      lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
 }
@@ -800,22 +739,22 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
 template <typename Torus>
 __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
                             uint32_t gpu_count, Torus *lwe_array_out,
-                             Torus *lwe_array_in, void *bsk, Torus *ksk,
+                             Torus *lwe_array_in, void **bsks, Torus **ksks,
                             uint32_t num_radix_blocks, uint32_t bits_per_block,
                             int_bit_extract_luts_buffer<Torus> *bit_extract) {

  integer_radix_apply_univariate_lookup_table_kb(
-      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsk, ksk,
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
      num_radix_blocks * bits_per_block, bit_extract->lut);
 }

 template <typename Torus>
-__host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
-                           uint32_t gpu_count, Torus *signs_array_out,
-                           Torus *signs_array_in,
-                           int_comparison_buffer<Torus> *mem_ptr,
-                           std::function<Torus(Torus)> sign_handler_f,
-                           void *bsk, Torus *ksk, uint32_t num_sign_blocks) {
+__host__ void
+reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+             Torus *signs_array_out, Torus *signs_array_in,
+             int_comparison_buffer<Torus> *mem_ptr,
+             std::function<Torus(Torus)> sign_handler_f, void **bsks,
+             Torus **ksks, uint32_t num_sign_blocks) {

  cudaSetDevice(gpu_indexes[0]);
  auto diff_buffer = mem_ptr->diff_buffer;
@@ -845,14 +784,16 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,
  if (num_sign_blocks > 2) {
    auto lut = diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, reduce_two_orderings_function);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        reduce_two_orderings_function);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    while (num_sign_blocks > 2) {
      pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
                  big_lwe_dimension, num_sign_blocks, 4);
      integer_radix_apply_univariate_lookup_table_kb(
-          streams, gpu_indexes, gpu_count, signs_a, signs_b, bsk, ksk,
+          streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
          num_sign_blocks / 2, lut);

      auto last_block_signs_b =
@@ -877,14 +818,16 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,

    auto lut = diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, final_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        final_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
                2, 4);
    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
                                                   gpu_count, signs_array_out,
-                                                   signs_b, bsk, ksk, 1, lut);
+                                                   signs_b, bsks, ksks, 1, lut);

  } else {

@@ -895,40 +838,74 @@ __host__ void reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes,

    auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, final_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        final_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
                                                   gpu_count, signs_array_out,
-                                                   signs_a, bsk, ksk, 1, lut);
+                                                   signs_a, bsks, ksks, 1, lut);
  }
 }

 template <typename Torus>
 void scratch_cuda_apply_univariate_lut_kb(
-    cudaStream_t stream, uint32_t gpu_index, int_radix_lut<Torus> **mem_ptr,
-    Torus *input_lut, uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_radix_lut<Torus> **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

-  *mem_ptr = new int_radix_lut<Torus>(stream, gpu_index, params, 1,
-                                      num_radix_blocks, allocate_gpu_memory);
-  cuda_memcpy_async_to_gpu((*mem_ptr)->lut, input_lut,
+  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
+                                      1, num_radix_blocks, allocate_gpu_memory);
+  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
+  // 0
+  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut,
                           (params.glwe_dimension + 1) *
                               params.polynomial_size * sizeof(Torus),
-                           stream, gpu_index);
+                           streams[0], gpu_indexes[0]);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 }

 template <typename Torus>
 void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  uint32_t gpu_count, Torus *radix_lwe_out,
                                  Torus *radix_lwe_in,
-                                  int_radix_lut<Torus> *mem, Torus *ksk,
-                                  void *bsk, uint32_t num_blocks) {
+                                  int_radix_lut<Torus> *mem, Torus **ksks,
+                                  void **bsks, uint32_t num_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsk, ksk,
+      streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
      num_blocks, mem);
 }

+template <typename Torus>
+void scratch_cuda_apply_bivariate_lut_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_radix_lut<Torus> **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
+                                      1, num_radix_blocks, allocate_gpu_memory);
+  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
+  // 0
+  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut,
+                           (params.glwe_dimension + 1) *
+                               params.polynomial_size * sizeof(Torus),
+                           streams[0], gpu_indexes[0]);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+}
+
+template <typename Torus>
+void host_apply_bivariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                                 uint32_t gpu_count, Torus *radix_lwe_out,
+                                 Torus *radix_lwe_in_1, Torus *radix_lwe_in_2,
+                                 int_radix_lut<Torus> *mem, Torus **ksks,
+                                 void **bsks, uint32_t num_blocks,
+                                 uint32_t shift) {
+
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in_1,
+      radix_lwe_in_2, bsks, ksks, num_blocks, mem, shift);
+}
+
 #endif // TFHE_RS_INTERNAL_INTEGER_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -66,7 +66,7 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
 * the integer radix multiplication in keyswitch->bootstrap order.
 */
 void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
@@ -87,7 +87,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
  case 8192:
  case 16384:
    scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
-        static_cast<cudaStream_t>(stream), gpu_index,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
        allocate_gpu_memory);
    break;
@@ -127,8 +127,9 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
 */
 void cuda_integer_mult_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right, void *bsk,
-    void *ksk, int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
+    void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
+    void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size,
+    uint32_t num_blocks) {

  switch (polynomial_size) {
  case 256:
@@ -136,63 +137,56 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
-        num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 512:
    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
-        num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 1024:
    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
-        num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 2048:
    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
-        num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 4096:
    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
-        num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 8192:
    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
-        num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  case 16384:
    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
-        num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -200,29 +194,30 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
  }
 }

-void cleanup_cuda_integer_mult(void *stream, uint32_t gpu_index,
-                               int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
+                               uint32_t gpu_count, int8_t **mem_ptr_void) {

  int_mul_memory<uint64_t> *mem_ptr =
      (int_mul_memory<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

 void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_in_radix,
-    uint32_t max_num_radix_in_vec, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus);
  scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
      max_num_radix_in_vec, params, allocate_gpu_memory);
 }
@@ -230,7 +225,7 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
 void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
-    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks_in_radix) {
+    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {

  auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;

@@ -246,49 +241,43 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 1024:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 2048:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 4096:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 8192:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 16384:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -298,11 +287,12 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
  free(terms_degree);
 }

-void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void *stream,
-                                                    uint32_t gpu_index,
+void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
+                                                    uint32_t *gpu_indexes,
+                                                    uint32_t gpu_count,
                                                    int8_t **mem_ptr_void) {
  int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
      (int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -181,14 +181,13 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
 }
 template <typename Torus>
 __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
-    cudaStream_t stream, uint32_t gpu_index,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
+  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
    check_cuda_error(cudaFuncSetAttribute(
        tree_add_chunks<Torus, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
@@ -203,18 +202,17 @@ __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
    check_cuda_error(cudaGetLastError());
  }
  *mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
-      stream, gpu_index, params, num_blocks_in_radix, max_num_radix_in_vec,
-      allocate_gpu_memory);
+      streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
+      max_num_radix_in_vec, allocate_gpu_memory);
 }

 template <typename Torus, class params>
 __host__ void host_integer_sum_ciphertexts_vec_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
-    Torus *radix_lwe_out, Torus *terms, int *terms_degree, void *bsk,
-    uint64_t *ksk, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
+    Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
+    uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec) {

-  cudaSetDevice(gpu_indexes[0]);
  auto new_blocks = mem_ptr->new_blocks;
  auto old_blocks = mem_ptr->old_blocks;
  auto small_lwe_vector = mem_ptr->small_lwe_vector;
@@ -258,6 +256,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
    dim3 add_grid(ch_amount, num_blocks, 1);
    size_t sm_size = big_lwe_size * sizeof(Torus);

+    cudaSetDevice(gpu_indexes[0]);
    if (sm_size < max_shared_memory)
      tree_add_chunks<Torus, FULLSM><<<add_grid, 512, sm_size, streams[0]>>>(
          new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
@@ -281,10 +280,10 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
    // we allocate luts_message_carry in the host function (instead of scratch)
    // to reduce average memory consumption
    auto luts_message_carry = new int_radix_lut<Torus>(
-        streams[0], gpu_indexes[0], mem_ptr->params, 2, total_count, true);
+        streams, gpu_indexes, gpu_count, mem_ptr->params, 2, total_count, true);

-    auto message_acc = luts_message_carry->get_lut(0);
-    auto carry_acc = luts_message_carry->get_lut(1);
+    auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
+    auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);

    // define functions for each accumulator
    auto lut_f_message = [message_modulus](Torus x) -> Torus {
@@ -316,7 +315,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
    cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
                             streams[0], gpu_indexes[0]);

-    smart_copy<<<sm_copy_count, 256, 0, streams[0]>>>(
+    smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
        new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
        big_lwe_size);
    check_cuda_error(cudaGetLastError());
@@ -324,24 +323,33 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
    if (carry_count > 0)
      cuda_set_value_async<Torus>(
          streams[0], gpu_indexes[0],
-          luts_message_carry->get_lut_indexes(message_count), 1, carry_count);
+          luts_message_carry->get_lut_indexes(gpu_indexes[0], message_count), 1,
+          carry_count);

-    cuda_keyswitch_lwe_ciphertext_vector(
-        streams[0], gpu_indexes[0], small_lwe_vector, lwe_indexes_in,
-        new_blocks, lwe_indexes_in, ksk, polynomial_size * glwe_dimension,
-        lwe_dimension, mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
-        message_count);
+    luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

+    auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    /// After this keyswitch execution, we need to synchronize the streams
+    /// because the keyswitch and PBS do not operate on the same number of
+    /// inputs
+    execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, small_lwe_vector,
+                             lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
+                             polynomial_size * glwe_dimension, lwe_dimension,
+                             mem_ptr->params.ks_base_log,
+                             mem_ptr->params.ks_level, message_count, true);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
    execute_pbs<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
-                       lwe_indexes_out, luts_message_carry->lut,
-                       luts_message_carry->lut_indexes, small_lwe_vector,
-                       lwe_indexes_in, bsk, luts_message_carry->buffer,
+                       lwe_indexes_out, luts_message_carry->lut_vec,
+                       luts_message_carry->lut_indexes_vec, small_lwe_vector,
+                       lwe_indexes_in, bsks, luts_message_carry->buffer,
                       glwe_dimension, lwe_dimension, polynomial_size,
                       mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
                       mem_ptr->params.grouping_factor, total_count, 2, 0,
-                       max_shared_memory, mem_ptr->params.pbs_type);
-
-    luts_message_carry->release(streams[0], gpu_indexes[0]);
+                       max_shared_memory, mem_ptr->params.pbs_type, true);
+    luts_message_carry->release(streams, gpu_indexes, gpu_count);

    int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
    int new_blocks_created = 2 * ch_amount * num_blocks;
@@ -360,18 +368,17 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
                num_blocks);

  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
-                                     radix_lwe_out, mem_ptr->scp_mem, bsk, ksk,
-                                     num_blocks);
+                                     radix_lwe_out, nullptr, nullptr,
+                                     mem_ptr->scp_mem, bsks, ksks, num_blocks);
 }

 template <typename Torus, typename STorus, class params>
 __host__ void host_integer_mult_radix_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
-    uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
+    uint64_t *radix_lwe_right, void **bsks, uint64_t **ksks,
    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
  auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
@@ -438,6 +445,7 @@ __host__ void host_integer_mult_radix_kb(
  dim3 grid(lsb_vector_block_count, 1, 1);
  dim3 thds(params::degree / params::opt, 1, 1);

+  cudaSetDevice(gpu_indexes[0]);
  all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
      radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
      vector_lsb_rhs, vector_msb_rhs, num_blocks);
@@ -445,13 +453,14 @@ __host__ void host_integer_mult_radix_kb(

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res,
-      vector_result_sb, bsk, ksk, total_block_count, luts_array,
+      vector_result_sb, bsks, ksks, total_block_count, luts_array,
      luts_array->params.message_modulus);

  vector_result_lsb = &block_mul_res[0];
  vector_result_msb = &block_mul_res[lsb_vector_block_count *
                                     (polynomial_size * glwe_dimension + 1)];

+  cudaSetDevice(gpu_indexes[0]);
  fill_radix_from_lsb_msb<Torus, params>
      <<<num_blocks * num_blocks, params::degree / params::opt, 0,
         streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
@@ -474,18 +483,17 @@ __host__ void host_integer_mult_radix_kb(

  host_integer_sum_ciphertexts_vec_kb<Torus, params>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb,
-      terms_degree, bsk, ksk, mem_ptr->sum_ciphertexts_mem, num_blocks,
+      terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
      2 * num_blocks);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
-    cudaStream_t stream, uint32_t gpu_index, int_mul_memory<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
-  cudaSetDevice(gpu_index);
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {
  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
+  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
    check_cuda_error(cudaFuncSetAttribute(
        tree_add_chunks<Torus, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
@@ -500,7 +508,7 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
    check_cuda_error(cudaGetLastError());
  }

-  *mem_ptr = new int_mul_memory<Torus>(stream, gpu_index, params,
+  *mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
                                       num_radix_blocks, allocate_gpu_memory);
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -12,12 +12,12 @@ void cuda_negate_integer_radix_ciphertext_64_inplace(
 }

 void scratch_cuda_integer_radix_overflowing_sub_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -25,7 +25,7 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_overflowing_sub_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }
@@ -33,7 +33,7 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64(
 void cuda_integer_radix_overflowing_sub_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left,
-    void *radix_lwe_right, int8_t *mem_ptr, void *bsk, void *ksk,
+    void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t num_blocks) {

  auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
@@ -45,8 +45,8 @@ void cuda_integer_radix_overflowing_sub_kb_64(
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 1024:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<1024>>(
@@ -54,8 +54,8 @@ void cuda_integer_radix_overflowing_sub_kb_64(
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 2048:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<2048>>(
@@ -63,8 +63,8 @@ void cuda_integer_radix_overflowing_sub_kb_64(
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 4096:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<4096>>(
@@ -72,8 +72,8 @@ void cuda_integer_radix_overflowing_sub_kb_64(
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 8192:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<8192>>(
@@ -81,8 +81,8 @@ void cuda_integer_radix_overflowing_sub_kb_64(
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 16384:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<16384>>(
@@ -90,8 +90,8 @@ void cuda_integer_radix_overflowing_sub_kb_64(
        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  default:
    PANIC("Cuda error (integer overflowing sub): unsupported polynomial size. "
@@ -99,11 +99,12 @@ void cuda_integer_radix_overflowing_sub_kb_64(
  }
 }

-void cleanup_cuda_integer_radix_overflowing_sub(void *stream,
-                                                uint32_t gpu_index,
+void cleanup_cuda_integer_radix_overflowing_sub(void **streams,
+                                                uint32_t *gpu_indexes,
+                                                uint32_t gpu_count,
                                                int8_t **mem_ptr_void) {
  int_overflowing_sub_memory<uint64_t> *mem_ptr =
      (int_overflowing_sub_memory<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -90,20 +90,19 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,

 template <typename Torus>
 __host__ void scratch_cuda_integer_overflowing_sub_kb(
-    cudaStream_t stream, uint32_t gpu_index,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_overflowing_sub_memory<Torus> **mem_ptr, uint32_t num_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  *mem_ptr = new int_overflowing_sub_memory<Torus>(
-      stream, gpu_index, params, num_blocks, allocate_gpu_memory);
+      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
 }

 template <typename Torus, class params>
 __host__ void host_integer_overflowing_sub_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left,
-    Torus *radix_lwe_right, void *bsk, uint64_t *ksk,
+    Torus *radix_lwe_right, void **bsks, uint64_t **ksks,
    int_overflowing_sub_memory<uint64_t> *mem_ptr, uint32_t num_blocks) {

  auto radix_params = mem_ptr->params;
@@ -116,7 +115,7 @@ __host__ void host_integer_overflowing_sub_kb(

  host_propagate_single_sub_borrow<Torus>(
      streams, gpu_indexes, gpu_count, radix_lwe_overflowed, radix_lwe_out,
-      mem_ptr->borrow_prop_mem, bsk, ksk, num_blocks);
+      mem_ptr->borrow_prop_mem, bsks, ksks, num_blocks);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -3,7 +3,7 @@
 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_input, void *clear_blocks,
-    uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
+    uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t lwe_ciphertext_count, BITOP_TYPE op) {

  host_integer_radix_scalar_bitop_kb<uint64_t>(
@@ -11,6 +11,6 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_input),
      static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      lwe_ciphertext_count, op);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -8,10 +8,9 @@ template <typename Torus>
 __host__ void host_integer_radix_scalar_bitop_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_input, Torus *clear_blocks,
-    uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks, BITOP_TYPE op) {
+    uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr, void **bsks,
+    Torus **ksks, uint32_t num_radix_blocks, BITOP_TYPE op) {

-  cudaSetDevice(gpu_indexes[0]);
  auto lut = mem_ptr->lut;
  auto params = lut->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -31,13 +30,14 @@ __host__ void host_integer_radix_scalar_bitop_kb(
  } else {
    // We have all possible LUTs pre-computed and we use the decomposed scalar
    // as index to recover the right one
-    cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
-                                 num_clear_blocks * sizeof(Torus), streams[0],
-                                 gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(gpu_indexes[0], 0),
+                                 clear_blocks, num_clear_blocks * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsk,
-        ksk, num_clear_blocks, lut);
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
+        ksks, num_clear_blocks, lut);

    if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
@@ -3,7 +3,7 @@
 void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_in, void *scalar_blocks,
-    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count,
+    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count,
    uint32_t num_scalar_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
@@ -15,8 +15,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
+        static_cast<uint64_t *>(scalar_blocks), buffer, bsks,
+        (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks);
    break;
  case GT:
  case GE:
@@ -27,7 +27,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(scalar_blocks), buffer,
-        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
+        buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
        lwe_ciphertext_count, num_scalar_blocks);
    break;
  case MAX:
@@ -36,8 +36,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
+        static_cast<uint64_t *>(scalar_blocks), buffer, bsks,
+        (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -9,10 +9,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
    int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
+    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -49,7 +48,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    // means scalar is zero
    host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
                                    mem_ptr->tmp_lwe_array_out, lwe_array_in,
-                                    mem_ptr, bsk, ksk, total_num_radix_blocks,
+                                    mem_ptr, bsks, ksks, total_num_radix_blocks,
                                    mem_ptr->is_zero_lut);

    auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
@@ -60,12 +59,14 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, scalar_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        scalar_last_leaf_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out,
-        mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);
+        mem_ptr->tmp_lwe_array_out, bsks, ksks, 1, lut);

  } else if (total_num_scalar_blocks < total_num_radix_blocks) {
    // We have to handle both part of the work described above
@@ -79,9 +80,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;

-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
+    auto lsb_streams = mem_ptr->lsb_streams;
+    auto msb_streams = mem_ptr->msb_streams;
+
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }

 #pragma omp parallel sections
    {
@@ -93,9 +97,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        Torus *lhs = diff_buffer->tmp_packed_left;
        Torus *rhs = diff_buffer->tmp_packed_right;

-        pack_blocks(lsb_stream, gpu_indexes[0], lhs, lwe_array_in,
+        pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, gpu_indexes[0], rhs, scalar_blocks, 0,
+        pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
                    total_num_scalar_blocks, message_modulus);

        // From this point we have half number of blocks
@@ -108,29 +112,31 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        // - 2 if lhs > rhs

        auto comparisons = mem_ptr->tmp_block_comparisons;
-        scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1,
-                                       comparisons, lhs, rhs, mem_ptr, bsk, ksk,
-                                       num_lsb_radix_blocks);
+        scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                       comparisons, lhs, rhs, mem_ptr, bsks,
+                                       ksks, num_lsb_radix_blocks);

        // Reduces a vec containing radix blocks that encrypts a sign
        // (inferior, equal, superior) to one single radix block containing the
        // final sign
-        tree_sign_reduction(&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out,
-                            comparisons, mem_ptr->diff_buffer->tree_buffer,
-                            mem_ptr->identity_lut_f, bsk, ksk,
-                            num_lsb_radix_blocks);
+        tree_sign_reduction(
+            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+            mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
+            ksks, num_lsb_radix_blocks);
      }
 #pragma omp section
      {
        //////////////
        // msb
        host_compare_with_zero_equality(
-            &msb_stream, &gpu_indexes[0], 1, lwe_array_msb_out, msb, mem_ptr,
-            bsk, ksk, num_msb_radix_blocks, mem_ptr->is_zero_lut);
+            msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb,
+            mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
      }
    }
-    cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
-    cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
+      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
+    }

    //////////////
    // Reduce the two blocks into one final
@@ -145,12 +151,14 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_bivariate_lookup_table_kb(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
-        lwe_array_msb_out, bsk, ksk, 1, lut, lut->params.message_modulus);
+        lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);

  } else {
    // We only have to do the regular comparison
@@ -177,7 +185,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    // - 2 if lhs > rhs
    auto comparisons = mem_ptr->tmp_lwe_array_out;
    scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
-                                   lhs, rhs, mem_ptr, bsk, ksk,
+                                   lhs, rhs, mem_ptr, bsks, ksks,
                                   num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
@@ -185,7 +193,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    // final sign
    tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
                        comparisons, mem_ptr->diff_buffer->tree_buffer,
-                        sign_handler_f, bsk, ksk, num_lsb_radix_blocks);
+                        sign_handler_f, bsks, ksks, num_lsb_radix_blocks);
  }
 }

@@ -194,7 +202,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
    int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
+    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

  cudaSetDevice(gpu_indexes[0]);
@@ -235,7 +243,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
    host_compare_with_zero_equality(
        streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
-        mem_ptr, bsk, ksk, total_num_radix_blocks, mem_ptr->is_zero_lut);
+        mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
    Torus *sign_block =
        lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;

@@ -276,12 +284,14 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->lut, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_bivariate_lookup_table_kb(
        streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
-        sign_block, bsk, ksk, 1, lut, lut->params.message_modulus);
+        sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);

  } else if (total_num_scalar_blocks < total_num_radix_blocks) {
    // We have to handle both part of the work described above
@@ -295,9 +305,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;

-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
+    auto lsb_streams = mem_ptr->lsb_streams;
+    auto msb_streams = mem_ptr->msb_streams;
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }

 #pragma omp parallel sections
    {
@@ -309,9 +321,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        Torus *lhs = diff_buffer->tmp_packed_left;
        Torus *rhs = diff_buffer->tmp_packed_right;

-        pack_blocks(lsb_stream, gpu_indexes[0], lhs, lwe_array_in,
+        pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, gpu_indexes[0], rhs, scalar_blocks, 0,
+        pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
                    total_num_scalar_blocks, message_modulus);

        // From this point we have half number of blocks
@@ -324,17 +336,17 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        // - 2 if lhs > rhs

        auto comparisons = mem_ptr->tmp_block_comparisons;
-        scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1,
-                                       comparisons, lhs, rhs, mem_ptr, bsk, ksk,
-                                       num_lsb_radix_blocks);
+        scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                       comparisons, lhs, rhs, mem_ptr, bsks,
+                                       ksks, num_lsb_radix_blocks);

        // Reduces a vec containing radix blocks that encrypts a sign
        // (inferior, equal, superior) to one single radix block containing the
        // final sign
-        tree_sign_reduction(&lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out,
-                            comparisons, mem_ptr->diff_buffer->tree_buffer,
-                            mem_ptr->identity_lut_f, bsk, ksk,
-                            num_lsb_radix_blocks);
+        tree_sign_reduction(
+            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+            mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
+            ksks, num_lsb_radix_blocks);
      }
 #pragma omp section
      {
@@ -343,8 +355,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        // We remove the last block (which is the sign)
        Torus *are_all_msb_zeros = lwe_array_msb_out;
        host_compare_with_zero_equality(
-            &msb_stream, &gpu_indexes[0], 1, are_all_msb_zeros, msb, mem_ptr,
-            bsk, ksk, num_msb_radix_blocks, mem_ptr->is_zero_lut);
+            msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb,
+            mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);

        auto sign_bit_pos = (int)log2(message_modulus) - 1;

@@ -372,24 +384,28 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

        auto signed_msb_lut = mem_ptr->signed_msb_lut;
        generate_device_accumulator_bivariate<Torus>(
-            msb_stream, gpu_indexes[0], signed_msb_lut->lut,
-            params.glwe_dimension, params.polynomial_size,
-            params.message_modulus, params.carry_modulus, lut_f);
+            msb_streams[0], gpu_indexes[0],
+            signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_f);
+        signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

        Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
        integer_radix_apply_bivariate_lookup_table_kb(
-            &msb_stream, &gpu_indexes[0], 1, lwe_array_msb_out, sign_block,
-            are_all_msb_zeros, bsk, ksk, 1, signed_msb_lut,
+            msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
+            are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
            signed_msb_lut->params.message_modulus);
      }
    }
-    cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
-    cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
+      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
+    }

    //////////////
    // Reduce the two blocks into one final
    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
-                 lwe_array_lsb_out, mem_ptr, sign_handler_f, bsk, ksk, 2);
+                 lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2);

  } else {
    // We only have to do the regular comparison
@@ -397,9 +413,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // total_num_radix_blocks == total_num_scalar_blocks
    uint32_t num_lsb_radix_blocks = total_num_radix_blocks;

-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
+    auto lsb_streams = mem_ptr->lsb_streams;
+    auto msb_streams = mem_ptr->msb_streams;

    auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_sign_out =
@@ -412,10 +430,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        Torus *lhs = diff_buffer->tmp_packed_left;
        Torus *rhs = diff_buffer->tmp_packed_right;

-        pack_blocks(lsb_stream, gpu_indexes[0], lhs, lwe_array_in,
+        pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
                    big_lwe_dimension, num_lsb_radix_blocks - 1,
                    message_modulus);
-        pack_blocks(lsb_stream, gpu_indexes[0], rhs, scalar_blocks, 0,
+        pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
                    num_lsb_radix_blocks - 1, message_modulus);

        // From this point we have half number of blocks
@@ -425,9 +443,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        // - 0 if lhs < rhs
        // - 1 if lhs == rhs
        // - 2 if lhs > rhs
-        scalar_compare_radix_blocks_kb(&lsb_stream, &gpu_indexes[0], 1,
-                                       lwe_array_ct_out, lhs, rhs, mem_ptr, bsk,
-                                       ksk, num_lsb_radix_blocks);
+        scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                       lwe_array_ct_out, lhs, rhs, mem_ptr,
+                                       bsks, ksks, num_lsb_radix_blocks);
      }
 #pragma omp section
      {
@@ -437,24 +455,26 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
            scalar_blocks + (total_num_scalar_blocks - 1);

        auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
-        create_trivial_radix(msb_stream, gpu_indexes[0], trivial_sign_block,
+        create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
                             scalar_sign_block, big_lwe_dimension, 1, 1,
                             message_modulus, carry_modulus);

        integer_radix_apply_bivariate_lookup_table_kb(
-            &msb_stream, &gpu_indexes[0], 1, lwe_array_sign_out,
-            encrypted_sign_block, trivial_sign_block, bsk, ksk, 1,
+            msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
+            encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
            mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
      }
    }
-    cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
-    cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
+      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
+    }

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
-                 lwe_array_ct_out, mem_ptr, sign_handler_f, bsk, ksk,
+                 lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
                 num_lsb_radix_blocks + 1);
  }
 }
@@ -463,7 +483,7 @@ template <typename Torus>
 __host__ void integer_radix_signed_scalar_maxmin_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

  cudaSetDevice(gpu_indexes[0]);
@@ -475,7 +495,7 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
  auto sign = mem_ptr->tmp_lwe_array_out;
  integer_radix_signed_scalar_difference_check_kb(
      streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
-      mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
+      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
      total_num_scalar_blocks);

  // There is no optimized CMUX for scalars, so we convert to a trivial
@@ -490,9 +510,10 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(

  // Selector
  // CMUX for Max or Min
-  host_integer_radix_cmux_kb(
-      streams, gpu_indexes, gpu_count, lwe_array_out, sign, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
+  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
+                             sign, lwe_array_left, lwe_array_right,
+                             mem_ptr->cmux_buffer, bsks, ksks,
+                             total_num_radix_blocks);
 }

 template <typename Torus>
@@ -500,19 +521,19 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
    int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
+    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

  if (mem_ptr->is_signed) {
    // is signed and scalar is positive
    integer_radix_signed_scalar_difference_check_kb(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
-        scalar_blocks, mem_ptr, sign_handler_f, bsk, ksk,
+        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
        total_num_radix_blocks, total_num_scalar_blocks);
  } else {
    integer_radix_unsigned_scalar_difference_check_kb(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
-        scalar_blocks, mem_ptr, sign_handler_f, bsk, ksk,
+        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
        total_num_radix_blocks, total_num_scalar_blocks);
  }
 }
@@ -521,32 +542,30 @@ template <typename Torus>
 __host__ void host_integer_radix_signed_scalar_maxmin_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

  if (mem_ptr->is_signed) {
    // is signed and scalar is positive
    integer_radix_signed_scalar_maxmin_kb(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
-        scalar_blocks, mem_ptr, bsk, ksk, total_num_radix_blocks,
+        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
        total_num_scalar_blocks);
  } else {
    integer_radix_unsigned_scalar_maxmin_kb(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
-        scalar_blocks, mem_ptr, bsk, ksk, total_num_radix_blocks,
+        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
        total_num_scalar_blocks);
  }
 }

 template <typename Torus>
-__host__ void
-scalar_compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
-                               uint32_t gpu_count, Torus *lwe_array_out,
-                               Torus *lwe_array_in, Torus *scalar_blocks,
-                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                               Torus *ksk, uint32_t num_radix_blocks) {
+__host__ void scalar_compare_radix_blocks_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -579,8 +598,8 @@ scalar_compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
  // Apply LUT to compare to 0
  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb(
-      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsk,
-      ksk, num_radix_blocks, sign_lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
+      ksks, num_radix_blocks, sign_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
@@ -594,10 +613,9 @@ template <typename Torus>
 __host__ void host_integer_radix_scalar_maxmin_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

  // Calculates the difference sign between the ciphertext and the scalar
@@ -607,7 +625,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  auto sign = mem_ptr->tmp_lwe_array_out;
  host_integer_radix_scalar_difference_check_kb(
      streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
-      mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
+      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
      total_num_scalar_blocks);

  // There is no optimized CMUX for scalars, so we convert to a trivial
@@ -624,7 +642,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  // CMUX for Max or Min
  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
+                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
                             total_num_radix_blocks);
 }

@@ -632,10 +650,9 @@ template <typename Torus>
 __host__ void host_integer_radix_scalar_equality_check_kb(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -662,10 +679,12 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
  auto lwe_array_msb_out =
      lwe_array_lsb_out + big_lwe_size * num_halved_lsb_radix_blocks;

-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }

-  auto lsb_stream = mem_ptr->lsb_stream;
-  auto msb_stream = mem_ptr->msb_stream;
+  auto lsb_streams = mem_ptr->lsb_streams;
+  auto msb_streams = mem_ptr->msb_streams;

 #pragma omp parallel sections
  {
@@ -677,19 +696,21 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
        auto packed_scalar =
            packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;

-        pack_blocks(lsb_stream, gpu_indexes[0], packed_blocks, lsb,
+        pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, gpu_indexes[0], packed_scalar, scalar_blocks, 0,
-                    num_scalar_blocks, message_modulus);
+        pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar,
+                    scalar_blocks, 0, num_scalar_blocks, message_modulus);

-        cuda_memcpy_async_gpu_to_gpu(scalar_comparison_luts->lut_indexes,
-                                     packed_scalar,
-                                     num_halved_scalar_blocks * sizeof(Torus),
-                                     lsb_stream, gpu_indexes[0]);
+        cuda_memcpy_async_gpu_to_gpu(
+            scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
+            packed_scalar, num_halved_scalar_blocks * sizeof(Torus),
+            lsb_streams[0], gpu_indexes[0]);
+        scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

        integer_radix_apply_univariate_lookup_table_kb(
-            &lsb_stream, &gpu_indexes[0], 1, lwe_array_lsb_out, packed_blocks,
-            bsk, ksk, num_halved_lsb_radix_blocks, scalar_comparison_luts);
+            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
+            packed_blocks, bsks, ksks, num_halved_lsb_radix_blocks,
+            scalar_comparison_luts);
      }
    }
 #pragma omp section
@@ -709,27 +730,29 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
          PANIC("Cuda error: integer operation not supported")
        }

-        host_compare_with_zero_equality(&msb_stream, &gpu_indexes[0], 1,
-                                        lwe_array_msb_out, msb, mem_ptr, bsk,
-                                        ksk, num_msb_radix_blocks, msb_lut);
+        host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
+                                        lwe_array_msb_out, msb, mem_ptr, bsks,
+                                        ksks, num_msb_radix_blocks, msb_lut);
      }
    }
  }

-  cuda_synchronize_stream(lsb_stream, gpu_indexes[0]);
-  cuda_synchronize_stream(msb_stream, gpu_indexes[0]);
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
+    cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
+  }

  switch (mem_ptr->op) {
  case COMPARISON_TYPE::EQ:
    are_all_comparisons_block_true(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
-        mem_ptr, bsk, ksk,
+        mem_ptr, bsks, ksks,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
    break;
  case COMPARISON_TYPE::NE:
    is_at_least_one_comparisons_block_true(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
-        mem_ptr, bsk, ksk,
+        mem_ptr, bsks, ksks,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
    break;
  default:
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -1,11 +1,12 @@
 #include "integer/scalar_mul.cuh"

 void scratch_cuda_integer_scalar_mul_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -13,7 +14,7 @@ void scratch_cuda_integer_scalar_mul_kb_64(
                          grouping_factor, message_modulus, carry_modulus);

  scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }
@@ -21,7 +22,7 @@ void scratch_cuda_integer_scalar_mul_kb_64(
 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
    uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set, int8_t *mem,
-    void *bsk, void *ksk, uint32_t lwe_dimension, uint32_t polynomial_size,
+    void **bsks, void **ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) {

  switch (polynomial_size) {
@@ -30,54 +31,54 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 1024:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 2048:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 4096:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 8192:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 16384:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  default:
    PANIC("Cuda error (scalar multiplication): unsupported polynomial size. "
@@ -85,12 +86,13 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
  }
 }

-void cleanup_cuda_integer_radix_scalar_mul(void *stream, uint32_t gpu_index,
+void cleanup_cuda_integer_radix_scalar_mul(void **streams,
+                                           uint32_t *gpu_indexes,
+                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void) {

-  cudaSetDevice(gpu_index);
  int_scalar_mul_buffer<uint64_t> *mem_ptr =
      (int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -29,13 +29,12 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,

 template <typename T>
 __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
-    cudaStream_t stream, uint32_t gpu_index, int_scalar_mul_buffer<T> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_scalar_mul_buffer<T> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
-  if (sm_size < cuda_get_max_shared_memory(gpu_index)) {
+  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
    check_cuda_error(cudaFuncSetAttribute(
        tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
        sm_size));
@@ -50,22 +49,22 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
    check_cuda_error(cudaGetLastError());
  }

-  *mem_ptr = new int_scalar_mul_buffer<T>(
-      stream, gpu_index, params, num_radix_blocks, allocate_gpu_memory);
+  *mem_ptr =
+      new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
+                                   num_radix_blocks, allocate_gpu_memory);
 }

 template <typename T, class params>
 __host__ void host_integer_scalar_mul_radix(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    T *lwe_array, T *decomposed_scalar, T *has_at_least_one_set,
-    int_scalar_mul_buffer<T> *mem, void *bsk, T *ksk,
+    int_scalar_mul_buffer<T> *mem, void **bsks, T **ksks,
    uint32_t input_lwe_dimension, uint32_t message_modulus,
    uint32_t num_radix_blocks, uint32_t num_scalars) {

  if (num_radix_blocks == 0 | num_scalars == 0)
    return;

-  cudaSetDevice(gpu_indexes[0]);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  uint32_t lwe_size = input_lwe_dimension + 1;
@@ -84,7 +83,7 @@ __host__ void host_integer_scalar_mul_radix(
                                   streams[0], gpu_indexes[0]);
      host_integer_radix_logical_scalar_shift_kb_inplace(
          streams, gpu_indexes, gpu_count, ptr, shift_amount,
-          mem->logical_scalar_shift_buffer, bsk, ksk, num_radix_blocks);
+          mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
    } else {
      // create trivial assign for value = 0
      cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, streams[0],
@@ -120,8 +119,8 @@ __host__ void host_integer_scalar_mul_radix(
    }
    host_integer_sum_ciphertexts_vec_kb<T, params>(
        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
-        terms_degree, bsk, ksk, mem->sum_ciphertexts_vec_mem, num_radix_blocks,
-        j);
+        terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
+        num_radix_blocks, j);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -1,12 +1,12 @@
 #include "scalar_rotate.cuh"

 void scratch_cuda_integer_radix_scalar_rotate_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -15,28 +15,30 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
      shift_type, allocate_gpu_memory);
 }

 void cuda_integer_radix_scalar_rotate_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t n, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
+    uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t num_blocks) {

  host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), n,
-      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_integer_radix_scalar_rotate(void *stream, uint32_t gpu_index,
+void cleanup_cuda_integer_radix_scalar_rotate(void **streams,
+                                              uint32_t *gpu_indexes,
+                                              uint32_t gpu_count,
                                              int8_t **mem_ptr_void) {

-  cudaSetDevice(gpu_index);
  int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
      (int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -13,14 +13,13 @@

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
-    cudaStream_t stream, uint32_t gpu_index,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
-      stream, gpu_index, shift_type, params, num_radix_blocks,
+      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
      allocate_gpu_memory);
 }

@@ -28,9 +27,7 @@ template <typename Torus>
 __host__ void host_integer_radix_scalar_rotate_kb_inplace(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array, uint32_t n, int_logical_scalar_shift_buffer<Torus> *mem,
-    void *bsk, Torus *ksk, uint32_t num_blocks) {
-
-  cudaSetDevice(gpu_indexes[0]);
+    void **bsks, Torus **ksks, uint32_t num_blocks) {

  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
@@ -60,6 +57,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
  // block_count blocks will be used in the grid
  // one block is responsible to process single lwe ciphertext
  if (mem->shift_type == LEFT_SHIFT) {
+    // rotate right as the blocks are from LSB to MSB
    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
                                   rotated_buffer, lwe_array, rotations,
                                   num_blocks, big_lwe_size);
@@ -80,11 +78,11 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
-        giver_blocks, bsk, ksk, num_blocks, lut_bivariate,
+        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
        lut_bivariate->params.message_modulus);

  } else {
-    // left shift
+    // rotate left as the blocks are from LSB to MSB
    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
                                  rotated_buffer, lwe_array, rotations,
                                  num_blocks, big_lwe_size);
@@ -104,7 +102,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
-        giver_blocks, bsk, ksk, num_blocks, lut_bivariate,
+        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
        lut_bivariate->params.message_modulus);
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -1,12 +1,12 @@
 #include "scalar_shifts.cuh"

 void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -15,7 +15,7 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
      shift_type, allocate_gpu_memory);
 }
@@ -26,23 +26,23 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
 /// rotations - 1 The remaining blocks are padded with zeros
 void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk,
+    uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t num_blocks) {

  host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), shift,
-      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
 }

 void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -51,7 +51,7 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
      params, shift_type, allocate_gpu_memory);
 }
@@ -65,34 +65,34 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
 /// zeros as would be done in the logical shift.
 void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    uint32_t shift, int8_t *mem_ptr, void *bsk, void *ksk,
+    uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t num_blocks) {

  host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), shift,
-      (int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_integer_radix_logical_scalar_shift(void *stream,
-                                                     uint32_t gpu_index,
+void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
+                                                     uint32_t *gpu_indexes,
+                                                     uint32_t gpu_count,
                                                     int8_t **mem_ptr_void) {

-  cudaSetDevice(gpu_index);
  int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
      (int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

-void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void *stream,
-                                                        uint32_t gpu_index,
+void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams,
+                                                        uint32_t *gpu_indexes,
+                                                        uint32_t gpu_count,
                                                        int8_t **mem_ptr_void) {

-  cudaSetDevice(gpu_index);
  int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
      (int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -14,14 +14,13 @@

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
-    cudaStream_t stream, uint32_t gpu_index,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
-      stream, gpu_index, shift_type, params, num_radix_blocks,
+      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
      allocate_gpu_memory);
 }

@@ -29,11 +28,9 @@ template <typename Torus>
 __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array, uint32_t shift,
-    int_logical_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
+    int_logical_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
    uint32_t num_blocks) {

-  cudaSetDevice(gpu_indexes[0]);
-
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -59,10 +56,11 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

  // rotate right all the blocks in radix ciphertext
  // copy result in new buffer
-  // 256 threads are used in every block
+  // 1024 threads are used in every block
  // block_count blocks will be used in the grid
  // one block is responsible to process single lwe ciphertext
  if (mem->shift_type == LEFT_SHIFT) {
+    // rotate right as the blocks are from LSB to MSB
    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
                                   rotated_buffer, lwe_array, rotations,
                                   num_blocks, big_lwe_size);
@@ -86,7 +84,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, partial_current_blocks,
-        partial_current_blocks, partial_previous_blocks, bsk, ksk,
+        partial_current_blocks, partial_previous_blocks, bsks, ksks,
        partial_block_count, lut_bivariate,
        lut_bivariate->params.message_modulus);

@@ -116,7 +114,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, partial_current_blocks,
-        partial_current_blocks, partial_next_blocks, bsk, ksk,
+        partial_current_blocks, partial_next_blocks, bsks, ksks,
        partial_block_count, lut_bivariate,
        lut_bivariate->params.message_modulus);
  }
@@ -124,14 +122,13 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
-    cudaStream_t stream, uint32_t gpu_index,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

-  cudaSetDevice(gpu_index);
  *mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
-      stream, gpu_index, shift_type, params, num_radix_blocks,
+      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
      allocate_gpu_memory);
 }

@@ -139,7 +136,7 @@ template <typename Torus>
 __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array, uint32_t shift,
-    int_arithmetic_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
+    int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
    uint32_t num_blocks) {

  cudaSetDevice(gpu_indexes[0]);
@@ -214,26 +211,28 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
    if (shift_within_block != 0 && rotations != num_blocks) {
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, partial_current_blocks,
-          partial_current_blocks, partial_next_blocks, bsk, ksk,
+          partial_current_blocks, partial_next_blocks, bsks, ksks,
          partial_block_count, lut_bivariate,
          lut_bivariate->params.message_modulus);
    }
    // Since our CPU threads will be working on different streams we shall
    // assert the work in the main stream is completed
-    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
 #pragma omp parallel sections
    {
      // All sections may be executed in parallel
 #pragma omp section
      {
        integer_radix_apply_univariate_lookup_table_kb(
-            &mem->local_stream_1, &gpu_indexes[0], 1, padding_block,
-            last_block_copy, bsk, ksk, 1, lut_univariate_padding_block);
+            mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
+            last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
        // Replace blocks 'pulled' from the left with the correct padding block
        for (uint i = 0; i < rotations; i++) {
          cuda_memcpy_async_gpu_to_gpu(
              lwe_array + (num_blocks - rotations + i) * big_lwe_size,
-              padding_block, big_lwe_size_bytes, mem->local_stream_1,
+              padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
              gpu_indexes[0]);
        }
      }
@@ -241,13 +240,15 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      {
        if (shift_within_block != 0 && rotations != num_blocks) {
          integer_radix_apply_univariate_lookup_table_kb(
-              &mem->local_stream_2, &gpu_indexes[0], 1, last_block,
-              last_block_copy, bsk, ksk, 1, lut_univariate_shift_last_block);
+              mem->local_streams_2, gpu_indexes, gpu_count, last_block,
+              last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
        }
      }
    }
-    cuda_synchronize_stream(mem->local_stream_1, gpu_indexes[0]);
-    cuda_synchronize_stream(mem->local_stream_2, gpu_indexes[0]);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
+    }

  } else {
    PANIC("Cuda error (scalar shift): left scalar shift is never of the "
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
@@ -1,13 +1,13 @@
 #include "shift_and_rotate.cuh"

 void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
-    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool is_signed, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -15,29 +15,29 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr, num_blocks, params,
      shift_type, is_signed, allocate_gpu_memory);
 }

 void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
-    void *lwe_shift, int8_t *mem_ptr, void *bsk, void *ksk,
+    void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks,
    uint32_t num_blocks) {

  host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_shift),
-      (int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_integer_radix_shift_and_rotate(void *stream,
-                                                 uint32_t gpu_index,
+void cleanup_cuda_integer_radix_shift_and_rotate(void **streams,
+                                                 uint32_t *gpu_indexes,
+                                                 uint32_t gpu_count,
                                                 int8_t **mem_ptr_void) {
-  cudaSetDevice(gpu_index);
  int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
      (int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -14,21 +14,20 @@

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(
-    cudaStream_t stream, uint32_t gpu_index,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_shift_and_rotate_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
    int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
    bool allocate_gpu_memory) {
-  cudaSetDevice(gpu_index);
  *mem_ptr = new int_shift_and_rotate_buffer<Torus>(
-      stream, gpu_index, shift_type, is_signed, params, num_radix_blocks,
-      allocate_gpu_memory);
+      streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
+      num_radix_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
 __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    Torus *lwe_array, Torus *lwe_shift, int_shift_and_rotate_buffer<Torus> *mem,
-    void *bsk, Torus *ksk, uint32_t num_radix_blocks) {
+    void **bsks, Torus **ksks, uint32_t num_radix_blocks) {
  uint32_t bits_per_block = std::log2(mem->params.message_modulus);
  uint32_t total_nb_bits = bits_per_block * num_radix_blocks;
  if (total_nb_bits == 0)
@@ -42,8 +41,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

  // Extract all bits
  auto bits = mem->tmp_bits;
-  extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsk,
-                        ksk, num_radix_blocks, bits_per_block,
+  extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
+                        ksks, num_radix_blocks, bits_per_block,
                        mem->bit_extract_luts);

  // Extract shift bits
@@ -64,7 +63,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  // so that it is already aligned to the correct position of the cmux input
  // and we reduce noise growth
  extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
-                        bsk, ksk, 1, max_num_bits_that_tell_shift,
+                        bsks, ksks, 1, max_num_bits_that_tell_shift,
                        mem->bit_extract_luts_with_offset_2);

  // If signed, do an "arithmetic shift" by padding with the sign bit
@@ -90,6 +89,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    auto rotations = 1 << d;
    switch (mem->shift_type) {
    case LEFT_SHIFT:
+      // rotate right as the blocks are from LSB to MSB
      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
                                     rotated_input, input_bits_b, rotations,
                                     total_nb_bits, big_lwe_size);
@@ -104,6 +104,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
                          streams[0], gpu_indexes[0]);
      break;
    case RIGHT_SHIFT:
+      // rotate left as the blocks are from LSB to MSB
      host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
                                    rotated_input, input_bits_b, rotations,
                                    total_nb_bits, big_lwe_size);
@@ -119,11 +120,13 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
            rotations * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
      break;
    case LEFT_ROTATE:
+      // rotate right as the blocks are from LSB to MSB
      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
                                     rotated_input, input_bits_b, rotations,
                                     total_nb_bits, big_lwe_size);
      break;
    case RIGHT_ROTATE:
+      // rotate left as the blocks are from LSB to MSB
      host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
                                    rotated_input, input_bits_b, rotations,
                                    total_nb_bits, big_lwe_size);
@@ -150,7 +153,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    // we have
    // control_bit|b|a
    integer_radix_apply_univariate_lookup_table_kb(
-        streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsk, ksk,
+        streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
        total_nb_bits, mux_lut);
  }

@@ -188,7 +191,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    // To give back a clean ciphertext
    auto cleaning_lut = mem->cleaning_lut;
    integer_radix_apply_univariate_lookup_table_kb(
-        streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsk, ksk,
+        streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
        num_radix_blocks, cleaning_lut);
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -7,7 +7,7 @@
 #endif

 #include "device.h"
-#include "helper.h"
+#include "helper_multi_gpu.h"
 #include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
 #include <stdio.h>
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -65,7 +65,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
                                                 double2 *dest, ST *src,
                                                 uint32_t polynomial_size,
                                                 uint32_t total_polynomials) {
-
  cudaSetDevice(gpu_index);
  int shared_memory_size = sizeof(double) * polynomial_size;

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cu
@@ -20,7 +20,7 @@ get_join_buffer_element(int level_id, int glwe_id, grid_group &group,
 template <>
 __device__ int get_this_block_rank(cluster_group &cluster, bool support_dsm) {
  if (support_dsm)
-    return cluster.block_rank();
+    return cluster.block_index().y;
  else
    return blockIdx.y;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -7,6 +7,7 @@
 #include "programmable_bootstrap_multibit.h"

 #include "cooperative_groups.h"
+#include "helper_multi_gpu.h"

 using namespace cooperative_groups;
 namespace cg = cooperative_groups;
@@ -119,16 +120,20 @@ __device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
 template <typename Torus>
 void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
                 uint32_t gpu_count, Torus *lwe_array_out,
-                 Torus *lwe_output_indexes, Torus *lut_vector,
-                 Torus *lut_vector_indexes, Torus *lwe_array_in,
-                 Torus *lwe_input_indexes, void *bootstrapping_key,
-                 int8_t *pbs_buffer, uint32_t glwe_dimension,
+                 Torus *lwe_output_indexes, std::vector<Torus *> lut_vec,
+                 std::vector<Torus *> lut_indexes_vec, Torus *lwe_array_in,
+                 Torus *lwe_input_indexes, void **bootstrapping_keys,
+                 std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
                 uint32_t lwe_dimension, uint32_t polynomial_size,
                 uint32_t base_log, uint32_t level_count,
                 uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
                 uint32_t num_luts, uint32_t lwe_idx,
-                 uint32_t max_shared_memory, PBS_TYPE pbs_type) {
-  auto num_inputs_on_gpu = input_lwe_ciphertext_count / gpu_count;
+                 uint32_t max_shared_memory, PBS_TYPE pbs_type,
+                 bool sync_streams = true) {
+  auto active_gpu_count =
+      get_active_gpu_count(input_lwe_ciphertext_count, gpu_count);
+  if (sync_streams)
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits
@@ -136,14 +141,24 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
    case MULTI_BIT:
      PANIC("Error: 32-bit multibit PBS is not supported.\n")
    case CLASSICAL:
-      cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
-          streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes,
-          lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
-          polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
-          lwe_idx, max_shared_memory);
+#pragma omp parallel for num_threads(active_gpu_count)
+      for (uint i = 0; i < active_gpu_count; i++) {
+        int num_inputs_on_gpu =
+            get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
+        int gpu_offset =
+            get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
+        auto d_lut_vector_indexes =
+            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
+        cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
+            streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
+            lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
+            polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
+            lwe_idx, max_shared_memory, gpu_offset);
+      }
      break;
    default:
+      PANIC("Error: unsupported cuda PBS type.")
      break;
    }
    break;
@@ -153,20 +168,39 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
    case MULTI_BIT:
      if (grouping_factor == 0)
        PANIC("Multi-bit PBS error: grouping factor should be > 0.")
-      cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-          streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes,
-          lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
-          polynomial_size, grouping_factor, base_log, level_count,
-          num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory);
+#pragma omp parallel for num_threads(active_gpu_count)
+      for (uint i = 0; i < active_gpu_count; i++) {
+        int num_inputs_on_gpu =
+            get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
+        int gpu_offset =
+            get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
+        auto d_lut_vector_indexes =
+            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
+        cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
+            streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
+            lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
+            polynomial_size, grouping_factor, base_log, level_count,
+            num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory,
+            gpu_offset);
+      }
      break;
    case CLASSICAL:
-      cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
-          streams[0], gpu_indexes[0], lwe_array_out, lwe_output_indexes,
-          lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
-          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
-          polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
-          lwe_idx, max_shared_memory);
+#pragma omp parallel for num_threads(active_gpu_count)
+      for (uint i = 0; i < active_gpu_count; i++) {
+        int num_inputs_on_gpu =
+            get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
+        int gpu_offset =
+            get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
+        auto d_lut_vector_indexes =
+            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
+        cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+            streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
+            lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
+            polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
+            lwe_idx, max_shared_memory, gpu_offset);
+      }
      break;
    default:
      PANIC("Error: unsupported cuda PBS type.")
@@ -176,6 +210,11 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes,
    PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
          "moduli are supported.")
  }
+
+  if (sync_streams)
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
 }

 template <typename Torus>
@@ -186,8 +225,6 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
                         uint32_t input_lwe_ciphertext_count,
                         uint32_t max_shared_memory, PBS_TYPE pbs_type,
                         bool allocate_gpu_memory) {
-  if (gpu_index != 0)
-    PANIC("GPU error (pbs): all memory has to reside in GPU 0.")
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
@@ -158,7 +158,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory) {
+    uint32_t max_shared_memory, uint32_t gpu_offset) {

  if (base_log > 32)
    PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -172,7 +172,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 512:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
@@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 1024:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
@@ -190,7 +190,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 2048:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
@@ -199,7 +199,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 4096:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
@@ -208,7 +208,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 8192:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
@@ -217,7 +217,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 16384:
    host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
@@ -226,7 +226,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -307,7 +307,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory) {
+    uint32_t max_shared_memory, uint32_t gpu_offset) {

  if (base_log > 64)
    PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -321,7 +321,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 512:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
@@ -330,7 +330,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 1024:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
@@ -339,7 +339,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 2048:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
@@ -348,7 +348,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 4096:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
@@ -357,7 +357,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 8192:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
@@ -366,7 +366,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  case 16384:
    host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
@@ -375,7 +375,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
        glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset);
    break;
  default:
    PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -52,7 +52,7 @@ __global__ void device_programmable_bootstrap_amortized(
    double2 *bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_idx,
-    size_t device_memory_size_per_sample) {
+    size_t device_memory_size_per_sample, uint32_t gpu_offset) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
@@ -79,7 +79,8 @@ __global__ void device_programmable_bootstrap_amortized(
                      (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);

  auto block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
+      &lwe_array_in[lwe_input_indexes[blockIdx.x + gpu_offset] *
+                    (lwe_dimension + 1)];
  Torus *block_lut_vector =
      &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
                  (glwe_dimension + 1)];
@@ -197,7 +198,7 @@ __global__ void device_programmable_bootstrap_amortized(
  }

  auto block_lwe_array_out =
-      &lwe_array_out[lwe_output_indexes[blockIdx.x] *
+      &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] *
                     (glwe_dimension * polynomial_size + 1)];

  // The blind rotation for this block is over
@@ -257,8 +258,8 @@ __host__ void scratch_programmable_bootstrap_amortized(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
-  cudaSetDevice(gpu_index);

+  cudaSetDevice(gpu_index);
  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
@@ -298,7 +299,7 @@ __host__ void host_programmable_bootstrap_amortized(
    int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory) {
+    uint32_t max_shared_memory, uint32_t gpu_offset) {

  cudaSetDevice(gpu_index);
  uint64_t SM_FULL =
@@ -332,14 +333,14 @@ __host__ void host_programmable_bootstrap_amortized(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
-            level_count, lwe_idx, DM_FULL);
+            level_count, lwe_idx, DM_FULL, gpu_offset);
  } else if (max_shared_memory < SM_FULL) {
    device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>
        <<<grid, thds, SM_PART, stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
-            level_count, lwe_idx, DM_PART);
+            level_count, lwe_idx, DM_PART, gpu_offset);
  } else {
    // For devices with compute capability 7.x a single thread block can
    // address the full capacity of shared memory. Shared memory on the
@@ -351,7 +352,7 @@ __host__ void host_programmable_bootstrap_amortized(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
            glwe_dimension, lwe_dimension, polynomial_size, base_log,
-            level_count, lwe_idx, 0);
+            level_count, lwe_idx, 0, gpu_offset);
  }
  check_cuda_error(cudaGetLastError());
 }
--- a/Show More
+++ b/Show More