chore(gpu): use the same parameters as the cpu

chore(ci): reduce the number of cpu threads used in tests on big instances
feat(gpu): unsigned scalar div
2026-01-11 15:48:20 -05:00 · 2024-06-14 09:23:44 +02:00 · 2024-06-13 21:22:29 +02:00 · 2024-06-13 21:22:29 +02:00 · 2024-06-13 17:38:46 +02:00 · 2024-06-13 17:38:46 +02:00
523 changed files with 35993 additions and 35237 deletions
--- a/.github/workflows/approve_label.yml
+++ b/.github/workflows/approve_label.yml
@@ -1,5 +1,5 @@
-# Manage approved label in pull request
-name: PR approved label manager
+# Add labels in pull request
+name: PR label manager

 on:
  pull_request:
@@ -27,7 +27,9 @@ jobs:
      # Add label only if the review is approved and if the label doesn't already exist
      - name: Add approved label
        uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
-        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
+        if: ${{ github.event_name == 'pull_request_review'
+          && github.event.review.state == 'approved'
+          && !contains(fromJSON(env.LABELS), 'approved') }}
        with:
          # We need to use a PAT to be able to trigger `labeled` event for the other workflow.
          github_token: ${{ secrets.FHE_ACTIONS_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -18,41 +18,42 @@ on:
  pull_request:

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (fast-tests)
+  setup-instance:
+    name: Setup instance (fast-tests)
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-      aws-region: ${{ steps.start-instance.outputs.aws-region }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
          profile: cpu-big

  fast-tests:
    name: Fast CPU tests
-    needs: setup-ec2
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

@@ -116,22 +117,21 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

-  teardown-ec2:
-    name: Teardown EC2 instance (fast-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, fast-tests ]
+  teardown-instance:
+    name: Teardown instance (fast-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, fast-tests ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          region: ${{ needs.setup-ec2.outputs.aws-region }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -139,4 +139,4 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_gpu_4090_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_4090_tests.yml
@@ -16,7 +16,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [labeled]
+    types: [ labeled ]

 jobs:
  cuda-tests-linux:
@@ -29,10 +29,12 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

--- a/.github/workflows/aws_tfhe_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_tests.yml
@@ -18,31 +18,30 @@ on:
  pull_request:

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cuda-tests)
+  setup-instance:
+    name: Setup instance (cuda-tests)
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-      aws-region: ${{ steps.start-instance.outputs.aws-region }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
          profile: gpu-test

-  cuda-tests-linux:
-    name: CUDA tests
-    needs: setup-ec2
+  cuda-pcc:
+    name: CUDA post-commit checks
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -56,14 +55,16 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

@@ -94,6 +95,66 @@ jobs:
        run: |
          make pcc_gpu

+      - name: Slack Notification
+        if: ${{ always() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  cuda-tests-linux:
+    name: CUDA tests
+    needs: [ setup-instance, cuda-pcc ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
      - name: Run core crypto, integer and internal CUDA backend tests
        run: |
          make test_gpu
@@ -118,22 +179,21 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

-  teardown-ec2:
-    name: Teardown EC2 instance (cuda-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, cuda-tests-linux ]
+  teardown-instance:
+    name: Teardown instance (cuda-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-pcc, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          region: ${{ needs.setup-ec2.outputs.aws-region }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -141,4 +201,4 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -18,42 +18,43 @@ on:
    types: [ labeled ]

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (unsigned-integer-tests)
+  setup-instance:
+    name: Setup instance (unsigned-integer-tests)
    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-      aws-region: ${{ steps.start-instance.outputs.aws-region }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
          profile: cpu-big

  unsigned-integer-tests:
    name: Unsigned integer tests
-    needs: setup-ec2
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

@@ -81,22 +82,21 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

-  teardown-ec2:
-    name: Teardown EC2 instance (unsigned-integer-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, unsigned-integer-tests ]
+  teardown-instance:
+    name: Teardown instance (unsigned-integer-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, unsigned-integer-tests ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          region: ${{ needs.setup-ec2.outputs.aws-region }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -104,4 +104,4 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_multi_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_multi_gpu_tests.yml
@@ -0,0 +1,139 @@
+# Compile and test tfhe-cuda-backend on an AWS instance
+name: TFHE Cuda Backend - Full tests multi-GPU
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+
+jobs:
+  # Start an instance via slab-github-runner; its label is passed on as runner-name.
+  setup-instance:
+    name: Setup instance (cuda-tests-multi-gpu)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: multi-gpu-test
+
+  cuda-tests-linux:
+    name: CUDA multi-GPU tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          # Keep the checkout token out of the local git config, as in the other workflows.
+          persist-credentials: 'false'
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Run core crypto, integer and internal CUDA backend tests
+        run: |
+          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+      - name: Slack Notification
+        if: ${{ always() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "CUDA AWS multi-GPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  # always() keeps this job running after test failures; it is only skipped when setup was skipped.
+  teardown-instance:
+    name: Teardown instance (cuda-tests-multi-gpu)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -18,42 +18,43 @@ on:
    types: [ labeled ]

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (signed-integer-tests)
+  setup-instance:
+    name: Setup instance (signed-integer-tests)
    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-      aws-region: ${{ steps.start-instance.outputs.aws-region }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
          profile: cpu-big

  signed-integer-tests:
    name: Signed integer tests
-    needs: setup-ec2
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

@@ -85,22 +86,21 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

-  teardown-ec2:
-    name: Teardown EC2 instance (signed-integer-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, signed-integer-tests ]
+  teardown-instance:
+    name: Teardown instance (signed-integer-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, signed-integer-tests ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          region: ${{ needs.setup-ec2.outputs.aws-region }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -108,4 +108,4 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -10,95 +10,211 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
    types: [ labeled ]
+  schedule:
+    # Nightly tests @ 1AM after each work day
+    - cron: "0 1 * * MON-FRI"

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cpu-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
+  should-run:
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    permissions:
+      pull-requests: write
+    outputs:
+      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
+      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
+      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.core_crypto_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      boolean_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.boolean_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.shortint_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      high_level_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.high_level_api_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      c_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.c_api_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      examples_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.examples_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      apps_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.apps_any_changed || steps.changed-files.outputs.dependencies_any_changed }}
+      user_docs_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.user_docs_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            dependencies:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+            csprng:
+              - concrete-csprng/**
+            zk_pok:
+              - tfhe-zk-pok/**
+            core_crypto:
+              - tfhe/src/core_crypto/**
+            boolean:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/boolean/**
+            shortint:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+            high_level_api:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+            c_api:
+              - tfhe/src/**
+            examples:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - tfhe/examples/**
+            apps:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - apps/trivium/src/**
+            user_docs:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - 'tfhe/docs/**.md'
+              - README.md
+
+      - name: Aggregate file changes
+        id: aggregated-changes
+        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
+          steps.changed-files.outputs.csprng_any_changed == 'true' ||
+          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
+          steps.changed-files.outputs.boolean_any_changed == 'true' ||
+          steps.changed-files.outputs.shortint_any_changed == 'true' ||
+          steps.changed-files.outputs.high_level_api_any_changed == 'true' ||
+          steps.changed-files.outputs.c_api_any_changed == 'true' ||
+          steps.changed-files.outputs.examples_any_changed == 'true' ||
+          steps.changed-files.outputs.apps_any_changed == 'true' ||
+          steps.changed-files.outputs.user_docs_any_changed == 'true')
+        run: |
+          echo "any_changed=true" >> "$GITHUB_OUTPUT"
+
+  setup-instance:
+    name: Setup instance (cpu-tests)
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.should-run.outputs.any_file_changed == 'true')
+    needs: should-run
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-      aws-region: ${{ steps.start-instance.outputs.aws-region }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
          profile: cpu-big

  cpu-tests:
    name: CPU tests
-    needs: setup-ec2
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    needs: [ should-run, setup-instance ]
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

      - name: Run concrete-csprng tests
+        if: needs.should-run.outputs.csprng_test == 'true'
        run: |
          make test_concrete_csprng

      - name: Run tfhe-zk-pok tests
+        if: needs.should-run.outputs.zk_pok_test == 'true'
        run: |
          make test_zk_pok

      - name: Run core tests
+        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
          AVX512_SUPPORT=ON make test_core_crypto

      - name: Run boolean tests
+        if: needs.should-run.outputs.boolean_test == 'true'
        run: |
          make test_boolean

      - name: Run C API tests
+        if: needs.should-run.outputs.c_api_test == 'true'
        run: |
          make test_c_api

      - name: Run user docs tests
+        if: needs.should-run.outputs.user_docs_test == 'true'
        run: |
          make test_user_doc

      - name: Gen Keys if required
+        if: needs.should-run.outputs.shortint_test == 'true'
        run: |
          make gen_key_cache

      - name: Run shortint tests
+        if: needs.should-run.outputs.shortint_test == 'true'
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_shortint_ci

      - name: Run high-level API tests
+        if: needs.should-run.outputs.high_level_api_test == 'true'
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_high_level_api

      - name: Run example tests
+        if: needs.should-run.outputs.examples_test == 'true'
        run: |
          make test_examples
          make dark_market

      - name: Run apps tests
+        if: needs.should-run.outputs.apps_test == 'true'
        run: |
          make test_trivium
          make test_kreyvium
@@ -111,22 +227,21 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

-  teardown-ec2:
-    name: Teardown EC2 instance (cpu-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, cpu-tests ]
+  teardown-instance:
+    name: Teardown instance (cpu-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cpu-tests ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          region: ${{ needs.setup-ec2.outputs.aws-region }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -134,4 +249,4 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -18,42 +18,43 @@ on:
    types: [ labeled ]

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (wasm-tests)
+  setup-instance:
+    name: Setup instance (wasm-tests)
    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-      aws-region: ${{ steps.start-instance.outputs.aws-region }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
          profile: cpu-small

  wasm-tests:
    name: WASM tests
-    needs: setup-ec2
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

@@ -81,22 +82,21 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

-  teardown-ec2:
-    name: Teardown EC2 instance (wasm-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, wasm-tests ]
+  teardown-instance:
+    name: Teardown instance (wasm-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, wasm-tests ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          region: ${{ needs.setup-ec2.outputs.aws-region }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -104,4 +104,4 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/boolean_benchmark.yml
+++ b/.github/workflows/boolean_benchmark.yml
@@ -53,7 +53,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -63,7 +63,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -97,13 +97,13 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -23,7 +23,7 @@ jobs:
      fail-fast: false

    steps:
-      - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29

      - name: Install and run newline linter checks
        if: matrix.os == 'ubuntu-latest'
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -10,7 +10,7 @@ jobs:
      - name: Check first line
        uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
        with:
-          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\(\w+\))?\:) .+$'
+          pattern: '^((feat|fix|chore|refactor|style|test|docs|doc)(\([\w\-_]+\))?\:) .+$'
          flags: "gs"
          error: 'Your first line has to contain a commit type and scope like "feat(my_feature): msg".'
          excludeDescription: "true" # optional: this excludes the description body of a pull request
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29

      - name: Get actionlint
        run: |
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -6,70 +6,58 @@ env:
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-    # All the inputs are provided by Slab
-    inputs:
-      instance_id:
-        description: "AWS instance ID"
-        type: string
-      instance_image_id:
-        description: "AWS instance AMI ID"
-        type: string
-      instance_type:
-        description: "AWS instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: 'Slab request ID'
-        type: string
-      fork_repo:
-        description: 'Name of forked repo as user/repo'
-        type: string
-      fork_git_sha:
-        description: 'Git SHA to checkout from fork'
-        type: string
+  # Code coverage workflow is only run via workflow_dispatch event since execution duration is not stabilized yet.

 jobs:
-  code-coverage:
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
-      cancel-in-progress: true
-    runs-on: ${{ inputs.runner_name }}
-    timeout-minutes: 11520 # 8 days
+  setup-instance:
+    name: Setup instance (code-coverage)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      # Step used for log purpose.
-      - name: Instance configuration used
-        run: |
-          echo "ID: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-          echo "Fork repo: ${{ inputs.fork_repo }}"
-          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
-
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
-          repository: ${{ inputs.fork_repo }}
-          ref: ${{ inputs.fork_git_sha }}
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+  code-coverage:
+    name: Code coverage tests
+    needs: setup-instance
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.event_name }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 5760 # 4 days
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
+        uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
        with:
          files_yaml: |
            tfhe:
@@ -99,7 +87,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
+        uses: codecov/codecov-action@125fc84a9a348dbcf27191600683ec096ec9021c
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -113,7 +101,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
+        uses: codecov/codecov-action@125fc84a9a348dbcf27191600683ec096ec9021c
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -127,8 +115,28 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:
+    name: Teardown instance (code-coverage)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, code-coverage ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (code-coverage) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/core_crypto_benchmark.yml
+++ b/.github/workflows/core_crypto_benchmark.yml
@@ -53,7 +53,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -63,13 +63,14 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

      - name: Run benchmarks with AVX512
        run: |
          make bench_pbs
+          make bench_pbs128
          make bench_ks

      - name: Parse results
@@ -88,13 +89,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/core_crypto_gpu_benchmark.yml
+++ b/.github/workflows/core_crypto_gpu_benchmark.yml
@@ -1,43 +1,45 @@
-# Run core crypto benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
+# Run core crypto benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
 name: Core crypto GPU benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      # This input is not used in this workflow but still mandatory since a calling workflow could
-      # use it. If a triggering command include a user_inputs field, then the triggered workflow
-      # must include this very input, otherwise the workflow won't be called.
-      # See start_full_benchmarks.yml as example.
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
+  schedule:
+    # Weekly benchmarks are triggered every Saturday at 1 a.m. (UTC).
+    - cron: '0 1 * * 6'

 env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  run-core-crypto-benchmarks:
-    name: Execute GPU core crypto benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+  setup-instance:
+    name: Setup instance (cuda-core-crypto-benchmarks)
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-core-crypto-benchmarks:
+    name: Execute GPU core crypto benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -45,33 +47,43 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
    steps:
-      - name: Instance configuration used
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -103,28 +115,26 @@ jobs:

      - name: Parse results
        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
+          --hardware "n3-H100x1" \
          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
+          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --name-suffix avx512 \
          --walk-subdirs \
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -144,14 +154,39 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-core-crypto-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-core-crypto-benchmarks.result }}
+          SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ needs.cuda-core-crypto-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-core-crypto-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-core-crypto-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "Instance teardown (cuda-core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -17,44 +17,44 @@ on:
  pull_request:
    types: [ labeled ]

-
 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (csprng-randomness-tests)
+  setup-instance:
+    name: Setup instance (csprng-randomness-tests)
    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-      aws-region: ${{ steps.start-instance.outputs.aws-region }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
          profile: cpu-small

  csprng-randomness-tests:
    name: CSPRNG randomness tests
-    needs: setup-ec2
+    needs: setup-instance
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

@@ -70,22 +70,21 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

-  teardown-ec2:
-    name: Teardown EC2 instance (csprng-randomness-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, csprng-randomness-tests ]
+  teardown-instance:
+    name: Teardown instance (csprng-randomness-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, csprng-randomness-tests ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          region: ${{ needs.setup-ec2.outputs.aws-region }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -93,4 +92,4 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_4090_full_benchmark.yml
+++ b/.github/workflows/gpu_4090_full_benchmark.yml
@@ -39,7 +39,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -52,12 +52,12 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -81,7 +81,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -120,7 +120,7 @@ jobs:

    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -133,12 +133,12 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -163,7 +163,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -194,7 +194,7 @@ jobs:
    name: Remove 4090 bench label
    if: ${{ always() && github.event_name == 'pull_request' }}
    needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
-    runs-on: ["self-hosted", "4090-desktop"]
+    runs-on: ubuntu-latest
    steps:
      - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
        with:
--- a/.github/workflows/hyperstack_tfhe_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_gpu_tests.yml
@@ -0,0 +1,160 @@
+# CI workflow: build tfhe-cuda-backend and run its test suites on a Hyperstack H100 VM.
+name: TFHE Cuda Backend - Full tests on H100
+
+on:
+  pull_request:
+  # Manual trigger from the Actions tab, as an alternative to opening a pull request.
+  workflow_dispatch:
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  CARGO_TERM_COLOR: always
+  RUSTFLAGS: '-C target-cpu=native'
+  RUST_BACKTRACE: 'full'
+  RUST_MIN_STACK: '8388608'  # 8 * 1024 * 1024
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  setup-instance:  # starts the instance whose runner label cuda-tests-linux runs on
+    name: Setup instance (cuda-h100-tests)
+    runs-on: ubuntu-latest
+    steps:
+      - id: start-instance
+        name: Start instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: start
+          backend: hyperstack
+          profile: single-h100
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # read by later jobs through `needs`
+
+  cuda-tests-linux:  # builds and tests on the runner label produced by setup-instance
+    name: CUDA H100 tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}  # in-progress runs on main are kept
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y ca-certificates curl  # -y: no terminal is attached to confirm a prompt
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo \
+          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+           $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}  # still runs when an earlier step failed
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Point the C, C++ and CUDA host compilers at the gcc version selected by the matrix.
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}  # still runs when an earlier step failed
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";  # same value the "Set up home" step already exports
+          } >> "${GITHUB_ENV}"
+
+      - name: Run core crypto, integer and internal CUDA backend tests
+        run: |
+          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+  slack-notify:  # posts to Slack when the jobs listed in `needs` did not all succeed
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ubuntu-latest  # hosted runner: the instance label is empty if setup-instance failed
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true  # a Slack outage must not fail the workflow
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-h100-tests)
+    needs: [setup-instance, cuda-tests-linux]
+    runs-on: ubuntu-latest
+    if: always() && needs.setup-instance.result != 'skipped'  # run even after failures
+    steps:
+      - id: stop-instance
+        name: Stop instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+
+      - name: Slack Notification
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        if: failure()  # only when an earlier step of this job failed
+        continue-on-error: true
+        env:
+          SLACK_MESSAGE: 'Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})'
+          SLACK_COLOR: ${{ job.status }}
--- a/.github/workflows/integer_benchmark.yml
+++ b/.github/workflows/integer_benchmark.yml
@@ -46,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -70,7 +70,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/integer_full_benchmark.yml
+++ b/.github/workflows/integer_full_benchmark.yml
@@ -74,7 +74,7 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -92,12 +92,12 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -121,7 +121,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/integer_gpu_benchmark.yml
+++ b/.github/workflows/integer_gpu_benchmark.yml
@@ -1,24 +1,11 @@
-# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
+# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
 name: Integer GPU benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
+  push:
+    branches:
+      - main

 env:
  CARGO_TERM_COLOR: always
@@ -27,12 +14,35 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  run-integer-benchmarks:
-    name: Execute integer benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+  setup-instance:
+    name: Setup instance (cuda-integer-benchmarks)
+    # Push-triggered runs are limited to zama-ai/tfhe-rs; every other trigger always runs.
+    if: github.event_name != 'push' || github.repository == 'zama-ai/tfhe-rs'
+    runs-on: ubuntu-latest
+    steps:
+      - id: start-instance
+        name: Start instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: start
+          backend: hyperstack
+          profile: single-h100
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+
+  cuda-integer-benchmarks:
+    name: Execute GPU integer benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -40,33 +50,43 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
    steps:
-      - name: Instance configuration used
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -100,35 +120,33 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}

      - name: Parse results
        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
+          --hardware "n3-H100x1" \
          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
+          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -148,14 +166,39 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

+  slack-notify:  # posts to Slack when the jobs listed in `needs` did not all succeed
+    name: Slack Notification
+    runs-on: ubuntu-latest
+    needs: [setup-instance, cuda-integer-benchmarks]
+    continue-on-error: true
+    if: ${{ !success() && !cancelled() }}
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_MESSAGE: 'Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})'
+          SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
      - name: Slack Notification
-        if: ${{ !success() && !cancelled() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_gpu_full_benchmark.yml
@@ -1,32 +1,11 @@
-# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
+# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
 name: Integer GPU full benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      # This input is not used in this workflow but still mandatory since a calling workflow could
-      # use it. If a triggering command include a user_inputs field, then the triggered workflow
-      # must include this very input, otherwise the workflow won't be called.
-      # See start_full_benchmarks.yml as example.
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1a.m.
+    - cron: '0 1 * * 6'

 env:
  CARGO_TERM_COLOR: always
@@ -34,13 +13,36 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  integer-benchmarks:
-    name: Execute integer benchmarks for all operations flavor
-    runs-on: ${{ github.event.inputs.runner_name }}
+  setup-instance:
+    name: Setup instance (cuda-integer-full-benchmarks)
+    # Scheduled runs are limited to zama-ai/tfhe-rs; every other trigger always runs.
+    if: github.event_name != 'schedule' || github.repository == 'zama-ai/tfhe-rs'
+    runs-on: ubuntu-latest
+    steps:
+      - id: start-instance
+        name: Start instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: start
+          backend: hyperstack
+          profile: single-h100
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+
+  cuda-integer-full-benchmarks:
+    name: Execute GPU integer benchmarks for all operations flavor
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 1440 # 24 hours
-    if: ${{ !cancelled() }}
    continue-on-error: true
    strategy:
      fail-fast: false
@@ -52,19 +54,25 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
    steps:
-      - name: Instance configuration used
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -82,7 +90,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -107,7 +115,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -121,7 +129,7 @@ jobs:
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
+          --hardware "n3-H100x1" \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
@@ -132,7 +140,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -151,19 +159,39 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

-  slack-notification:
+  slack-notify:
    name: Slack Notification
-    runs-on: ${{ github.event.inputs.runner_name }}
+    needs: [ setup-instance, cuda-integer-full-benchmarks ]
+    runs-on: ubuntu-latest
    if: ${{ !success() && !cancelled() }}
-    needs: integer-benchmarks
+    continue-on-error: true
    steps:
-      - name: Notify
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-full-benchmarks.result }}
+          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-full-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-full-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_multi_bit_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_benchmark.yml
@@ -46,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -70,7 +70,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/integer_multi_bit_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_gpu_benchmark.yml
@@ -1,24 +1,11 @@
-# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
+# Run integer benchmarks with multi-bit cryptographic parameters on an instance and return parsed results to Slab CI bot.
 name: Integer GPU Multi-bit benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1a.m.
+    - cron: '0 1 * * 6'

 env:
  CARGO_TERM_COLOR: always
@@ -27,13 +14,36 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  cuda-integer-benchmarks:
-    name: Execute integer multi-bit benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
+  setup-instance:
+    name: Setup instance (cuda-integer-multi-bit-benchmarks)
+    # Scheduled runs are limited to zama-ai/tfhe-rs; every other trigger always runs.
+    if: github.event_name != 'schedule' || github.repository == 'zama-ai/tfhe-rs'
+    runs-on: ubuntu-latest
+    steps:
+      - id: start-instance
+        name: Start instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: start
+          backend: hyperstack
+          profile: single-h100
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+
+  cuda-integer-multi-bit-benchmarks:
+    name: Execute GPU integer multi-bit benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 1440 # 24 hours
-    if: ${{ !cancelled() }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -41,33 +51,43 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
    steps:
-      - name: Instance configuration used
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -101,35 +121,33 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}

      - name: Parse results
        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
+          --hardware "n3-H100x1" \
          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
+          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -149,14 +167,40 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}
+          SLACK_MESSAGE: "Integer GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-multi-bit-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
      - name: Slack Notification
-        if: ${{ !success() && !cancelled() }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml
@@ -0,0 +1,181 @@
+# Run 64-bit multi-bit integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: Integer multi GPU Multi-bit benchmarks
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1a.m.
+    - cron: '0 1 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
+    runs-on: ubuntu-latest
+    if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    outputs:  # runner label consumed by the benchmark and teardown jobs
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: multi-gpu-test
+
+  cuda-integer-multi-bit-multi-gpu-benchmarks:
+    name: Execute multi GPU integer multi-bit benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 1440 # 24 hours
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+    
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run multi-bit benchmarks with AVX512
+        run: |
+          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "p3.8xlarge" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        with:
+          name: ${{ github.sha }}_integer
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}
+          SLACK_MESSAGE: "Integer multi GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-multi-bit-multi-gpu-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_multi_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_multi_gpu_full_benchmark.yml
@@ -0,0 +1,184 @@
+# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: Integer multi GPU full benchmarks
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1a.m.
+    - cron: '0 1 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-integer-full-multi-gpu-benchmarks)
+    runs-on: ubuntu-latest
+    if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    outputs:  # runner label consumed by the benchmark and teardown jobs
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: multi-gpu-test
+
+  cuda-integer-full-multi-gpu-benchmarks:
+    name: Execute multi GPU integer benchmarks for all operations flavor
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 1440 # 24 hours
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        command: [integer, integer_multi_bit]
+        op_flavor: [default, unchecked]
+        # explicit include-based build matrix, of known valid options
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+    
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "p3.8xlarge" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --name-suffix avx512 \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        with:
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}
+          SLACK_MESSAGE: "Integer multi GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-multi-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-full-multi-gpu-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-multi-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -3,7 +3,7 @@ name: Tests on M1 CPU
 on:
  workflow_dispatch:
  pull_request:
-    types: [labeled]
+    types: [ labeled ]
  # Have a nightly build for M1 tests
  schedule:
    # * is a special character in YAML so you have to quote this string
@@ -31,10 +31,12 @@ jobs:
    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

@@ -86,6 +88,13 @@ jobs:
        run: |
          make test_boolean

+      # Because we do "illegal" things with the build system which Cargo does not seem to like much
+      # we need to clear the cache to make sure the C API is built properly and does not use a stale
+      # cached version
+      - name: Clear build cache
+        run: |
+          cargo clean
+
      - name: Run C API tests
        run: |
          make test_c_api
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -20,9 +20,14 @@ on:
        description: "Push node js package"
        type: boolean
        default: true
+      npm_latest_tag:
+        description: "Set NPM tag as latest"
+        type: boolean
+        default: false

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  NPM_TAG: ""

 jobs:
  publish_release:
@@ -30,10 +35,14 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

+      - name: Create NPM version tag
+        run: |
+          echo "NPM_TAG=$(sed -n -e '1,/^version/p' tfhe/Cargo.toml | grep '^version[[:space:]]*=' | cut -d '=' -f 2 | tr -d ' ')" >> "${GITHUB_ENV}"
+
      - name: Publish crate.io package
        if: ${{ inputs.push_to_crates }}
        env:
@@ -45,7 +54,7 @@ jobs:
      - name: Build web package
        if: ${{ inputs.push_web_package }}
        run: |
-          make build_web_js_api
+          make build_web_js_api_parallel

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
@@ -54,6 +63,15 @@ jobs:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
+          tag: ${{ env.NPM_TAG }}
+
+      - name: Publish web package as latest
+        if: ${{ inputs.push_web_package && inputs.npm_latest_tag }}
+        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
+        with:
+          token: ${{ secrets.NPM_TOKEN }}
+          package: tfhe/pkg/package.json
+          dry-run: ${{ inputs.dry_run }}

      - name: Build Node package
        if: ${{ inputs.push_node_package }}
@@ -70,6 +88,15 @@ jobs:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
          dry-run: ${{ inputs.dry_run }}
+          tag: ${{ env.NPM_TAG }}
+
+      - name: Publish Node package as latest
+        if: ${{ inputs.push_node_package && inputs.npm_latest_tag }}
+        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
+        with:
+          token: ${{ secrets.NPM_TOKEN }}
+          package: tfhe/pkg/package.json
+          dry-run: ${{ inputs.dry_run }}

      - name: Slack Notification
        if: ${{ failure() }}
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -21,28 +21,27 @@ env:
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (publish-cuda-release)
+  setup-instance:
+    name: Setup instance (publish-cuda-release)
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-      aws-region: ${{ steps.start-instance.outputs.aws-region }}
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
          profile: gpu-test

  publish-cuda-release:
    name: Publish CUDA Release
-    needs: setup-ec2
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -55,7 +54,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -64,7 +63,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable

@@ -105,22 +104,21 @@ jobs:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

-  teardown-ec2:
-    name: Teardown EC2 instance (publish-release)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, publish-cuda-release ]
+  teardown-instance:
+    name: Teardown instance (publish-release)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, publish-cuda-release ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          region: ${{ needs.setup-ec2.outputs.aws-region }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
@@ -128,4 +126,4 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -0,0 +1,42 @@
+# Publish new release of tfhe-zk-pok on crates.io.
+name: Publish tfhe-zk-pok release
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+jobs:
+  publish_release:
+    name: Publish tfhe-zk-pok Release
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Publish crate.io package
+        env:
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+        run: |
+          cargo publish -p tfhe-zk-pok --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "tfhe-zk-pok release failed: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -17,10 +17,10 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29

      - name: Checkout lattice-estimator
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
--- a/.github/workflows/shortint_benchmark.yml
+++ b/.github/workflows/shortint_benchmark.yml
@@ -45,7 +45,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -55,7 +55,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -89,13 +89,13 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_shortint
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/shortint_full_benchmark.yml
+++ b/.github/workflows/shortint_full_benchmark.yml
@@ -53,7 +53,7 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -71,12 +71,12 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -115,7 +115,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/signed_integer_benchmark.yml
+++ b/.github/workflows/signed_integer_benchmark.yml
@@ -46,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -70,7 +70,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/signed_integer_full_benchmark.yml
+++ b/.github/workflows/signed_integer_full_benchmark.yml
@@ -52,7 +52,7 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -70,12 +70,12 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -99,7 +99,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/signed_integer_multi_bit_benchmark.yml
+++ b/.github/workflows/signed_integer_multi_bit_benchmark.yml
@@ -46,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -56,7 +56,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -70,7 +70,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/start_benchmarks.yml
+++ b/.github/workflows/start_benchmarks.yml
@@ -36,14 +36,6 @@ on:
        description: "Run core crypto benches"
        type: boolean
        default: true
-      core_crypto_gpu_bench:
-        description: "Run core crypto benches on GPU"
-        type: boolean
-        default: true
-      wasm_client_bench:
-        description: "Run WASM client benches"
-        type: boolean
-        default: true

 jobs:
  start-benchmarks:
@@ -53,18 +45,17 @@ jobs:
        command: [ boolean_bench, shortint_bench,
                   integer_bench, integer_multi_bit_bench,
                   signed_integer_bench, signed_integer_multi_bit_bench,
-                   integer_gpu_bench, integer_multi_bit_gpu_bench,
-                   core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
+                   core_crypto_bench ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
+        uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
        with:
          files_yaml: |
            common_benches:
@@ -106,12 +97,9 @@ jobs:
              - tfhe/src/core_crypto/**
              - tfhe/benches/core_crypto/**
              - .github/workflows/core_crypto_benchmark.yml
-            wasm_client_bench:
-              - tfhe/web_wasm_parallel_tests/**
-              - .github/workflows/wasm_client_benchmark.yml

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/start_full_benchmarks.yml
+++ b/.github/workflows/start_full_benchmarks.yml
@@ -25,17 +25,17 @@ jobs:
    strategy:
      matrix:
        command: [ boolean_bench, shortint_full_bench,
-                   integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
-                   core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
+                   integer_full_bench, signed_integer_full_bench,
+                   core_crypto_bench, wasm_client_bench ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -13,14 +13,9 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0
-      - name: Save repo
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: repo-archive
-          path: '.'
      - name: git-sync
        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
        with:
--- a/.github/workflows/wasm_client_benchmark.yml
+++ b/.github/workflows/wasm_client_benchmark.yml
@@ -1,32 +1,14 @@
-# Run WASM client benchmarks on an AWS instance and return parsed results to Slab CI bot.
+# Run WASM client benchmarks on an instance and return parsed results to Slab CI bot.
 name: WASM client benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      # This input is not used in this workflow but still mandatory since a calling workflow could
-      # use it. If a triggering command include a user_inputs field, then the triggered workflow
-      # must include this very input, otherwise the workflow won't be called.
-      # See start_full_benchmarks.yml as example.
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
+  push:
+    branches:
+      - main
+  schedule:
+    # Weekly benchmarks are triggered every Saturday at 1 a.m. (UTC).
+    - cron: '0 1 * * 6'

 env:
  CARGO_TERM_COLOR: always
@@ -34,36 +16,88 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  run-wasm-client-benchmarks:
-    name: Execute WASM client benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+  # Gate job: reports whether any file matched by the `wasm_bench` filter changed.
+  should-run:
+    runs-on: ubuntu-latest
+    # Manual dispatches run in any repository; push and scheduled runs are limited to
+    # zama-ai/tfhe-rs.
+    if: github.event_name == 'workflow_dispatch' ||
+      ((github.event_name == 'push' || github.event_name == 'schedule') && github.repository == 'zama-ai/tfhe-rs')
+    permissions:
+      pull-requests: write
+    outputs:
+      # Taken from the `changed-files` step of this job. Job outputs are strings, so
+      # dependent jobs must compare this value against 'true' rather than use it as a boolean.
+      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            wasm_bench:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - tfhe/web_wasm_parallel_tests/**
+              - .github/workflows/wasm_client_benchmark.yml
+
+  # Starts the runner instance used by the benchmark job and exposes its label.
+  setup-instance:
+    name: Setup instance (wasm-client-benchmarks)
+    # Manual and scheduled runs always start an instance. Push runs start one only when a
+    # benchmark-relevant file changed. The job output is a string and any non-empty string
+    # (including 'false') is truthy in an expression, so it is compared against 'true'.
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench == 'true')
+    needs: should-run
+    runs-on: ubuntu-latest
+    outputs:
+      # Label of the started runner, consumed by `runs-on` of the benchmark job and by teardown.
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+  # Runs the WASM client benchmarks on the instance started by `setup-instance`.
+  wasm-client-benchmarks:
+    name: Execute WASM client benchmarks
+    needs: [ should-run, setup-instance ]
+    # Same gate as `setup-instance`. The job output is a string and 'false' would be truthy
+    # if used as a bare boolean, so it is compared against 'true' explicitly.
+    if: github.event_name != 'push' ||
+      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.wasm_bench == 'true')
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly

@@ -75,15 +109,12 @@ jobs:
      - name: Parse results
        run: |
          make parse_wasm_benchmarks
-
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py tfhe/wasm_pk_gen.csv ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
+          --hardware "m6i.4xlarge" \
+          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --key-gen

@@ -98,13 +129,13 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_wasm
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
@@ -130,8 +161,28 @@ jobs:
        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "WASM benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  # Stops the runner instance started by `setup-instance`.
+  teardown-instance:
+    name: Teardown instance (wasm-client-benchmarks)
+    # `always()` makes this job run even when the benchmark job failed or was cancelled.
+    # It is skipped only when `setup-instance` itself was skipped.
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, wasm-client-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@58f2cae4bf2c0b6728083f5f009b6dc0eb6dc3ac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          # Label of the runner to stop, as exposed by `setup-instance`.
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      # Sent only when a previous step of this job failed; a notification error is
+      # ignored (`continue-on-error`).
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/Makefile
+++ b/Makefile
@@ -160,6 +160,12 @@ check_nvm_installed:
 	@source ~/.nvm/nvm.sh && nvm --version > /dev/null 2>&1 || \
 	( echo "Unable to locate Node. Run 'make install_node'" && exit 1 )

+.PHONY: install_mlc # Install mlc (Markup Link Checker)
+install_mlc: install_rs_build_toolchain
+	@mlc --version > /dev/null 2>&1 || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install mlc --locked || \
+	( echo "Unable to install mlc, unknown error." && exit 1 )
+
 .PHONY: fmt # Format rust code
 fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
@@ -271,7 +277,7 @@ clippy_js_wasm_api: install_rs_check_toolchain
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
-clippy_tasks:
+clippy_tasks: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		-p tasks -- --no-deps -D warnings

@@ -281,19 +287,19 @@ clippy_trivium: install_rs_check_toolchain
 		-p tfhe-trivium -- --no-deps -D warnings

 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
-clippy_all_targets:
+clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
-clippy_concrete_csprng:
+clippy_concrete_csprng: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=$(TARGET_ARCH_FEATURE) \
 		-p concrete-csprng -- --no-deps -D warnings

 .PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
-clippy_zk_pok:
+clippy_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-zk-pok -- --no-deps -D warnings

@@ -376,7 +382,7 @@ build_c_api_gpu: install_rs_check_toolchain
 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
 build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
 		-p $(TFHE_SPEC)
 	@"$(MAKE)" symlink_c_libs_without_fingerprint

@@ -444,16 +450,16 @@ test_cuda_backend:
 test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend

 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
+test_core_crypto_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
-test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
+test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

@@ -479,14 +485,14 @@ test_c_api_rs: install_rs_check_toolchain

 .PHONY: test_c_api_c # Run the C tests for the C API
 test_c_api_c: build_c_api
-	./scripts/c_api_tests.sh
+	./scripts/c_api_tests.sh --cargo-profile "$(CARGO_PROFILE)"

 .PHONY: test_c_api # Run all the tests for the C API
 test_c_api: test_c_api_rs test_c_api_c

 .PHONY: test_c_api_gpu # Run the C tests for the C API
 test_c_api_gpu: build_c_api_gpu
-	./scripts/c_api_tests.sh --gpu
+	./scripts/c_api_tests.sh --gpu --cargo-profile "$(CARGO_PROFILE)"

 .PHONY: test_shortint_ci # Run the tests for shortint ci
 test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
@@ -638,12 +644,12 @@ test_kreyvium: install_rs_build_toolchain
 		-p tfhe-trivium -- --test-threads=1 kreyvium::

 .PHONY: test_concrete_csprng # Run concrete-csprng tests
-test_concrete_csprng:
+test_concrete_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng

 .PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
-test_zk_pok:
+test_zk_pok: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		-p tfhe-zk-pok

@@ -682,15 +688,23 @@ format_doc_latex:
 check_md_docs_are_tested:
 	RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested

+.PHONY: check_intra_md_links # Checks broken internal links in Markdown docs
+check_intra_md_links: install_mlc
+	mlc --offline --match-file-extension tfhe/docs
+
+.PHONY: check_md_links # Checks all broken links in Markdown docs
+check_md_links: install_mlc
+	mlc --match-file-extension tfhe/docs
+
 .PHONY: check_compile_tests # Build tests in debug without running them
-check_compile_tests:
+check_compile_tests: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
 		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
 		-p $(TFHE_SPEC)

 	@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
 		"$(MAKE)" build_c_api && \
-		./scripts/c_api_tests.sh --build-only; \
+		./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
 	fi

 .PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
@@ -813,8 +827,6 @@ bench_oprf: install_rs_check_toolchain
 	--bench oprf-integer-bench \
 	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

-
-
 .PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
 bench_shortint_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -823,7 +835,6 @@ bench_shortint_multi_bit: install_rs_check_toolchain
 	--bench shortint-bench \
 	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

-
 .PHONY: bench_boolean # Run benchmarks for boolean
 bench_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
@@ -836,6 +847,12 @@ bench_pbs: install_rs_check_toolchain
 	--bench pbs-bench \
 	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

+.PHONY: bench_pbs128 # Run benchmarks for PBS using FFT 128 bits
+bench_pbs128: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench pbs128-bench \
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+
 .PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
 bench_pbs_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
@@ -867,6 +884,7 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
 #
 # Utility tools
 #
+
 .PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
 gen_key_cache: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
@@ -942,8 +960,8 @@ sha256_bool: install_rs_check_toolchain
 	--features=$(TARGET_ARCH_FEATURE),boolean

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
-pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_all \
-check_compile_tests
+pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
+clippy_all check_compile_tests

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ production-ready library for all the advanced features of TFHE.
 <br></br>

 ## Table of Contents
- **[Getting Started](#getting-started)**
+- **[Getting started](#getting-started)**
   - [Cargo.toml configuration](#cargotoml-configuration)
   - [A simple example](#a-simple-example)
 - **[Resources](#resources)**
@@ -65,7 +65,7 @@ production-ready library for all the advanced features of TFHE.
 - **[Support](#support)**
 <br></br>

-## Getting Started
+## Getting started

 ### Cargo.toml configuration
 To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:
@@ -198,7 +198,7 @@ Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-

 ### Disclaimers

-#### Security Estimation
+#### Security estimation

 Security estimations are done using the
 [Lattice Estimator](https://github.com/malb/lattice-estimator)
@@ -206,13 +206,13 @@ with `red_cost_model = reduction.RC.BDGL16`.

 When a new update is published in the Lattice Estimator, we update parameters accordingly.

-### Security Model
+### Security model

 The default parameters for the TFHE-rs library are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at p_error = $2^{-40}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [1]. 

 [1] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf

-#### Side-Channel Attacks
+#### Side-channel attacks

 Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
 and will be released in upcoming versions.
@@ -241,7 +241,23 @@ Becoming an approved contributor involves signing our Contributor License Agreem
 <br></br>

 ### License
-This software is distributed under the **BSD-3-Clause-Clear** license. If you have any questions, please contact us at hello@zama.ai.
+This software is distributed under the **BSD-3-Clause-Clear** license. Read [this](LICENSE) for more details.
+
+#### FAQ
+**Is Zama’s technology free to use?**
+>Zama’s libraries are free to use under the BSD 3-Clause Clear license only for development, research, prototyping, and experimentation purposes. However, for any commercial use of Zama's open source code, companies must purchase Zama’s commercial patent license.
+>
+>Everything we do is open source, and we are very transparent about what it means for our users. You can read more about how we monetize our open source products at Zama in [this blog post](https://www.zama.ai/post/open-source).
+
+**What do I need to do if I want to use Zama’s technology for commercial purposes?**
+>To commercially use Zama’s technology you need to be granted Zama’s patent license. Please contact us at hello@zama.ai for more information.
+
+**Do you file IP on your technology?**
+>Yes, all Zama’s technologies are patented.
+
+**Can you customize a solution for my specific use case?**
+>We are open to collaborating and advancing the FHE space with our partners. If you have specific needs, please email us at hello@zama.ai.
+
 <p align="right">
  <a href="#about" > ↑ Back to top </a> 
 </p>
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -71,7 +71,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
 }

 fn main() {
-	let config = ConfigBuilder::all_disabled().enable_default_bool().build();
+	let config = ConfigBuilder::default().build();
 	let (client_key, server_key) = generate_keys(config);

 	let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -143,7 +143,7 @@ use tfhe::prelude::*;
 use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
-	let config = ConfigBuilder::all_disabled().enable_default_integers().build();
+	let config = ConfigBuilder::default().build();
 	let (hl_client_key, hl_server_key) = generate_keys(config);
 	let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
 	let ksk = CastingKey::new((&client_key, &server_key), (&hl_client_key, &hl_server_key));
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -13,6 +13,4 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

 [build-dependencies]
 cmake = { version = "0.1" }
-
-[dependencies]
-thiserror = "1.0"
+pkg-config = { version = "0.3" }
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -9,6 +9,11 @@ fn main() {
    }

    println!("Build tfhe-cuda-backend");
+    println!("cargo::rerun-if-changed=cuda/include");
+    println!("cargo::rerun-if-changed=cuda/src");
+    println!("cargo::rerun-if-changed=cuda/tests_and_benchmarks");
+    println!("cargo::rerun-if-changed=cuda/CMakeLists.txt");
+    println!("cargo::rerun-if-changed=src");
    if env::consts::OS == "linux" {
        let output = Command::new("./get_os_name.sh").output().unwrap();
        let distribution = String::from_utf8(output.stdout).unwrap();
@@ -21,7 +26,15 @@ fn main() {
        let dest = cmake::build("cuda");
        println!("cargo:rustc-link-search=native={}", dest.display());
        println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
-        println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
+
+        // Try to find the cuda libs with pkg-config, default to the path used by the nvidia runfile
+        if pkg_config::Config::new()
+            .atleast_version("10")
+            .probe("cuda")
+            .is_err()
+        {
+            println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
+        }
        println!("cargo:rustc-link-lib=gomp");
        println!("cargo:rustc-link-lib=cudart");
        println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
--- a/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
+++ b/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash

 set -e

--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -4,14 +4,14 @@
 #include <cstdint>

 extern "C" {
-void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
-                                                  void *v_stream,
+void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
+                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);
-void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
-                                                  void *v_stream,
+void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                  uint32_t gpu_index,
+                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);
 };
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -6,9 +6,9 @@
 #include <cstdlib>
 #include <cstring>
 #include <cuda_runtime.h>
+#include <vector>

 #define synchronize_threads_in_block() __syncthreads()
-
 extern "C" {

 #define check_cuda_error(ans)                                                  \
@@ -27,47 +27,33 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
    std::abort();                                                              \
  }

-struct cuda_stream_t {
-  cudaStream_t stream;
-  uint32_t gpu_index;
+cudaStream_t cuda_create_stream(uint32_t gpu_index);

-  cuda_stream_t(uint32_t gpu_index) {
-    this->gpu_index = gpu_index;
+void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);

-    check_cuda_error(cudaStreamCreate(&stream));
-  }
-
-  void release() {
-    check_cuda_error(cudaSetDevice(gpu_index));
-    check_cuda_error(cudaStreamDestroy(stream));
-  }
-
-  void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
-};
-
-cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
-
-void cuda_destroy_stream(cuda_stream_t *stream);
+void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);

 void *cuda_malloc(uint64_t size, uint32_t gpu_index);

-void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
+void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

 bool cuda_check_support_cooperative_groups();

+bool cuda_check_support_thread_block_clusters();
+
 void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                              cuda_stream_t *stream);
+                              cudaStream_t stream, uint32_t gpu_index);

 void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                                  cuda_stream_t *stream);
+                                  cudaStream_t stream, uint32_t gpu_index);

 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                              cuda_stream_t *stream);
+                              cudaStream_t stream, uint32_t gpu_index);

 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
-                       cuda_stream_t *stream);
+                       cudaStream_t stream, uint32_t gpu_index);

 int cuda_get_number_of_gpus();

@@ -75,20 +61,18 @@ void cuda_synchronize_device(uint32_t gpu_index);

 void cuda_drop(void *ptr, uint32_t gpu_index);

-void cuda_drop_async(void *ptr, cuda_stream_t *stream);
+void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);

 int cuda_get_max_shared_memory(uint32_t gpu_index);

-void cuda_synchronize_stream(cuda_stream_t *stream);
-
-void cuda_stream_add_callback(cuda_stream_t *stream,
+void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
                              cudaStreamCallback_t callback, void *user_data);
+}

 void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
                                  void *host_pointer);
-}

 template <typename Torus>
-void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
-                          Torus n);
+void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
+                          Torus *d_array, Torus value, Torus n);
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -0,0 +1,18 @@
+#ifndef HELPER_MULTI_GPU_H
+#define HELPER_MULTI_GPU_H
+#include <mutex>
+
+extern std::mutex m;
+extern bool p2p_enabled;
+
+extern "C" {
+int cuda_setup_multi_gpu();
+}
+
+int get_active_gpu_count(int num_inputs, int gpu_count);
+
+int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
+
+int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count);
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -6,16 +6,18 @@
 extern "C" {

 void cuda_keyswitch_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset = 0);

 void cuda_keyswitch_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset = 0);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -7,42 +7,42 @@

 extern "C" {

-void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
+void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
-void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
+void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
+void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
+void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -5,80 +5,89 @@
 #include <cstdint>

 enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
-enum PBS_VARIANT { DEFAULT = 0, CG = 1 };
+enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };

 extern "C" {
-void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
-                                 cuda_stream_t *stream,
+void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
+                                 void *input1, void *input2, void *output,
                                 uint32_t polynomial_size,
                                 uint32_t total_polynomials);

 void cuda_convert_lwe_programmable_bootstrap_key_32(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size);

 void cuda_convert_lwe_programmable_bootstrap_key_64(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size);

 void scratch_cuda_programmable_bootstrap_amortized_32(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory);
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);

 void scratch_cuda_programmable_bootstrap_amortized_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory);
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);

 void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
+    uint32_t max_shared_memory);

 void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
+    uint32_t max_shared_memory, uint32_t gpu_offset = 0);

-void cleanup_cuda_programmable_bootstrap_amortized(cuda_stream_t *stream,
+void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
+                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);

 void scratch_cuda_programmable_bootstrap_32(
-    cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

 void scratch_cuda_programmable_bootstrap_64(
-    cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

 void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
+    uint32_t max_shared_memory, uint32_t gpu_offset = 0);

 void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
+    uint32_t max_shared_memory, uint32_t gpu_offset = 0);

-void cleanup_cuda_programmable_bootstrap(cuda_stream_t *stream,
+void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                         int8_t **pbs_buffer);

 uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
@@ -111,6 +120,28 @@ get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft
 }

+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
+  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
+         sizeof(Torus) * polynomial_size +      // accumulator
+         sizeof(double2) * polynomial_size / 2; // accumulator fft
+}
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_programmable_bootstrap_tbc(
+    uint32_t polynomial_size) {
+  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
+}
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
+    uint32_t polynomial_size) {
+  return sizeof(double2) * polynomial_size / 2; // tbc
+}
+
 template <typename Torus>
 __host__ __device__ uint64_t
 get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
@@ -125,6 +156,11 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
 }

+template <typename Torus>
+__host__ bool
+supports_distributed_shared_memory_on_classic_programmable_bootstrap(
+    uint32_t polynomial_size, uint32_t max_shared_memory);
+
 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

 template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
@@ -135,13 +171,14 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

  PBS_VARIANT pbs_variant;

-  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
+  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
             bool allocate_gpu_memory) {
+
    this->pbs_variant = pbs_variant;

-    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
+    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

    if (allocate_gpu_memory) {
      switch (pbs_variant) {
@@ -173,17 +210,17 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
                       level_count * (glwe_dimension + 1);
        }
        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
+        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                (polynomial_size / 2) * sizeof(double2),
-            stream);
+            stream, gpu_index);

        global_accumulator = (Torus *)cuda_malloc_async(
            (glwe_dimension + 1) * input_lwe_ciphertext_count *
                polynomial_size * sizeof(Torus),
-            stream);
+            stream, gpu_index);
      } break;
      case PBS_VARIANT::CG: {
        uint64_t full_sm =
@@ -206,25 +243,73 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
        }

        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
+        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
-            stream);
+            stream, gpu_index);
      } break;
+#if CUDA_ARCH >= 900
+      case PBS_VARIANT::TBC: {
+
+        bool supports_dsm =
+            supports_distributed_shared_memory_on_classic_programmable_bootstrap<
+                Torus>(polynomial_size, max_shared_memory);
+
+        uint64_t full_sm =
+            get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
+                polynomial_size);
+        uint64_t partial_sm =
+            get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
+                polynomial_size);
+        uint64_t minimum_sm_tbc = 0;
+        if (supports_dsm)
+          minimum_sm_tbc =
+              get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
+                  Torus>(polynomial_size);
+
+        uint64_t partial_dm = full_sm - partial_sm;
+        uint64_t full_dm = full_sm;
+        uint64_t device_mem = 0;
+
+        // Running the TBC PBS needs at least minimum_sm_tbc bytes of shared
+        // memory. Those bytes are known to be available: otherwise the
+        // previous check would have redirected computation to another
+        // variant. If fewer than partial_sm bytes remain on top of that, the
+        // TBC PBS runs in NOSM mode. If partial_sm but not full_sm bytes
+        // remain, it runs in PARTIALSM mode. Otherwise it runs in FULLSM mode.
+        //
+        // Even NOSM mode still requires minimum_sm_tbc shared memory bytes.
+        if (max_shared_memory < partial_sm + minimum_sm_tbc) {
+          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                       (glwe_dimension + 1);
+        } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
+          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                       (glwe_dimension + 1);
+        }
+
+        // Otherwise, both kernels run all in shared memory
+        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
+
+        global_accumulator_fft = (double2 *)cuda_malloc_async(
+            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
+                polynomial_size / 2 * sizeof(double2),
+            stream, gpu_index);
+      } break;
+#endif
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }
    }
  }

-  void release(cuda_stream_t *stream) {
-    cuda_drop_async(d_mem, stream);
-    cuda_drop_async(global_accumulator_fft, stream);
+  void release(cudaStream_t stream, uint32_t gpu_index) {
+    cuda_drop_async(d_mem, stream, gpu_index);
+    cuda_drop_async(global_accumulator_fft, stream, gpu_index);

    if (pbs_variant == DEFAULT)
-      cuda_drop_async(global_accumulator, stream);
+      cuda_drop_async(global_accumulator, stream, gpu_index);
  }
 };

@@ -263,38 +348,64 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,

 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, double2 *bootstrapping_key,
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);

 template <typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, double2 *bootstrapping_key,
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
+
+#if (CUDA_ARCH >= 900)
+template <typename Torus>
+void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
+    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0);
+
+template <typename Torus, typename STorus>
+void scratch_cuda_programmable_bootstrap_tbc(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+#endif

 template <typename Torus, typename STorus>
 void scratch_cuda_programmable_bootstrap_cg(
-    cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

 template <typename Torus, typename STorus>
 void scratch_cuda_programmable_bootstrap(
-    cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory);

+template <typename Torus>
+bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
+                                                    uint32_t glwe_dimension,
+                                                    uint32_t polynomial_size,
+                                                    uint32_t level_count,
+                                                    uint32_t max_shared_memory);
+
 #ifdef __CUDACC__
 __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -11,68 +11,92 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t num_samples, uint32_t max_shared_memory);

 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
-    uint32_t grouping_factor);
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size, uint32_t grouping_factor);

 void scratch_cuda_multi_bit_programmable_bootstrap_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t chunk_size = 0);
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory, uint32_t chunk_size = 0);

 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
-
-void scratch_cuda_generic_multi_bit_programmable_bootstrap_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset,
    uint32_t lwe_chunk_size = 0);

-void cuda_generic_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
-
-void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
+void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
+                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);
 }

+template <typename Torus>
+__host__ bool
+supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
+    uint32_t polynomial_size, uint32_t max_shared_memory);
+
+template <typename Torus>
+bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
+    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t max_shared_memory);
+
+#if CUDA_ARCH >= 900
+template <typename Torus, typename STorus>
+void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size);
+
+template <typename Torus>
+void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
+    uint32_t gpu_offset, uint32_t lwe_chunk_size);
+#endif
+
 template <typename Torus, typename STorus>
 void scratch_cuda_cg_multi_bit_programmable_bootstrap(
-    cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);

+template <typename Torus, typename STorus>
+void scratch_cuda_cg_multi_bit_programmable_bootstrap(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+
 template <typename Torus>
 void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, Torus *bootstrapping_key,
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);

 template <typename Torus, typename STorus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
-    cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
@@ -80,14 +104,14 @@ void scratch_cuda_multi_bit_programmable_bootstrap(

 template <typename Torus>
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, Torus *bootstrapping_key,
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t gpu_offset, uint32_t lwe_chunk_size = 0);

 template <typename Torus>
 __host__ __device__ uint64_t
@@ -113,12 +137,25 @@ template <typename Torus>
 __host__ __device__ uint64_t
 get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
+    uint32_t polynomial_size);

 template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
  int8_t *d_mem_acc_step_two = NULL;
  int8_t *d_mem_acc_cg = NULL;
+  int8_t *d_mem_acc_tbc = NULL;

  double2 *keybundle_fft;
  Torus *global_accumulator;
@@ -126,25 +163,27 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {

  PBS_VARIANT pbs_variant;

-  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
+  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;
-    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
+    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

+    // default
    uint64_t full_sm_keybundle =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<
            Torus>(polynomial_size);
    uint64_t full_sm_accumulate_step_one =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
            polynomial_size);
-    uint64_t partial_sm_accumulate_step_one =
-        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
-            Torus>(polynomial_size);
    uint64_t full_sm_accumulate_step_two =
        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
            polynomial_size);
+    uint64_t partial_sm_accumulate_step_one =
+        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
+            Torus>(polynomial_size);
+    // cg
    uint64_t full_sm_cg_accumulate =
        get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
            polynomial_size);
@@ -162,80 +201,124 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
    auto num_blocks_acc_cg =
        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;

+#if CUDA_ARCH >= 900
+    uint64_t full_sm_tbc_accumulate =
+        get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+    uint64_t partial_sm_tbc_accumulate =
+        get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+    uint64_t minimum_sm_tbc =
+        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+    auto num_blocks_acc_tbc = num_blocks_acc_cg;
+#endif
+
    if (allocate_gpu_memory) {
      // Keybundle
      if (max_shared_memory < full_sm_keybundle)
        d_mem_keybundle = (int8_t *)cuda_malloc_async(
-            num_blocks_keybundle * full_sm_keybundle, stream);
+            num_blocks_keybundle * full_sm_keybundle, stream, gpu_index);

      switch (pbs_variant) {
-      case DEFAULT:
+      case PBS_VARIANT::CG:
+        // Accumulator CG
+        if (max_shared_memory < partial_sm_cg_accumulate)
+          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index);
+        else if (max_shared_memory < full_sm_cg_accumulate)
+          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index);
+        break;
+      case PBS_VARIANT::DEFAULT:
        // Accumulator step one
        if (max_shared_memory < partial_sm_accumulate_step_one)
          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_step_one * full_sm_accumulate_step_one, stream);
+              num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
+              gpu_index);
        else if (max_shared_memory < full_sm_accumulate_step_one)
          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream);
+              num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
+              gpu_index);

        // Accumulator step two
        if (max_shared_memory < full_sm_accumulate_step_two)
          d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_step_two * full_sm_accumulate_step_two, stream);
+              num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
+              gpu_index);
        break;
-      case CG:
-        // Accumulator CG
-        if (max_shared_memory < partial_sm_cg_accumulate)
-          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_cg * full_sm_cg_accumulate, stream);
-        else if (max_shared_memory < full_sm_cg_accumulate)
-          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
+#if CUDA_ARCH >= 900
+      case TBC:
+        // There is a minimum amount of memory we need to run the TBC PBS, which
+        // is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
+        // because otherwise the previous check would have redirected
+        // computation to some other variant. Beyond that minimum, if fewer than
+        // partial_sm_tbc_accumulate bytes are available, TBC PBS runs on NOSM.
+        // If partial_sm_tbc_accumulate but not full_sm_tbc_accumulate bytes are
+        // available, it runs on PARTIALSM. Otherwise, FULLSM.
+        //
+        // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
+
+        // Accumulator TBC
+        if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
+          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_tbc * full_sm_tbc_accumulate, stream, gpu_index);
+        else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
+          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream,
+              gpu_index);
        break;
+#endif
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }

      keybundle_fft = (double2 *)cuda_malloc_async(
          num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
-          stream);
+          stream, gpu_index);
      global_accumulator = (Torus *)cuda_malloc_async(
-          num_blocks_acc_step_two * polynomial_size * sizeof(Torus), stream);
+          num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
+          gpu_index);
      global_accumulator_fft = (double2 *)cuda_malloc_async(
          num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
-          stream);
+          stream, gpu_index);
    }
  }

-  void release(cuda_stream_t *stream) {
+  void release(cudaStream_t stream, uint32_t gpu_index) {

    if (d_mem_keybundle)
-      cuda_drop_async(d_mem_keybundle, stream);
+      cuda_drop_async(d_mem_keybundle, stream, gpu_index);
    switch (pbs_variant) {
    case DEFAULT:
      if (d_mem_acc_step_one)
-        cuda_drop_async(d_mem_acc_step_one, stream);
+        cuda_drop_async(d_mem_acc_step_one, stream, gpu_index);
      if (d_mem_acc_step_two)
-        cuda_drop_async(d_mem_acc_step_two, stream);
+        cuda_drop_async(d_mem_acc_step_two, stream, gpu_index);
      break;
    case CG:
      if (d_mem_acc_cg)
-        cuda_drop_async(d_mem_acc_cg, stream);
+        cuda_drop_async(d_mem_acc_cg, stream, gpu_index);
      break;
+#if CUDA_ARCH >= 900
+    case TBC:
+      if (d_mem_acc_tbc)
+        cuda_drop_async(d_mem_acc_tbc, stream, gpu_index);
+      break;
+#endif
    default:
      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }

-    cuda_drop_async(keybundle_fft, stream);
-    cuda_drop_async(global_accumulator, stream);
-    cuda_drop_async(global_accumulator_fft, stream);
+    cuda_drop_async(keybundle_fft, stream, gpu_index);
+    cuda_drop_async(global_accumulator, stream, gpu_index);
+    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
  }
 };

-#ifdef __CUDACC__
-
-__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count);
-
-#endif
+template <typename Torus, class params>
+__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
+                                     uint32_t polynomial_size,
+                                     uint32_t max_shared_memory);

 #endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -10,7 +10,8 @@ set(SOURCES
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1 +1,21 @@
 #include "ciphertext.cuh"
+
+void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
+                                                  uint32_t gpu_index,
+                                                  void *dest, void *src,
+                                                  uint32_t number_of_cts,
+                                                  uint32_t lwe_dimension) {
+  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
+      (uint64_t *)src, number_of_cts, lwe_dimension);
+}
+
+void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
+                                                  uint32_t gpu_index,
+                                                  void *dest, void *src,
+                                                  uint32_t number_of_cts,
+                                                  uint32_t lwe_dimension) {
+  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)dest,
+      (uint64_t *)src, number_of_cts, lwe_dimension);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -6,39 +6,23 @@
 #include <cstdint>

 template <typename T>
-void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
-                                               cuda_stream_t *stream,
-                                               uint32_t number_of_cts,
+void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
+                                               uint32_t gpu_index, T *dest,
+                                               T *src, uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
-  cuda_memcpy_async_to_gpu(dest, src, size, stream);
-}
-
-void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
-                                                  cuda_stream_t *stream,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension) {
-  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
-      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
+  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
 }

 template <typename T>
-void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
-                                               cuda_stream_t *stream,
-                                               uint32_t number_of_cts,
+void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
+                                               uint32_t gpu_index, T *dest,
+                                               T *src, uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
-  cuda_memcpy_async_to_cpu(dest, src, size, stream);
-}
-
-void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
-                                                  cuda_stream_t *stream,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension) {
-  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
-      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
+  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
@@ -20,9 +20,7 @@ private:
  uint32_t level_count;
  uint32_t base_log;
  uint32_t mask;
-  uint32_t halfbg;
  uint32_t num_poly;
-  T offset;
  int current_level;
  T mask_mod_b;
  T *state;
@@ -82,72 +80,12 @@ public:
    synchronize_threads_in_block();
  }

-  // Decomposes a single polynomial
-  __device__ void
-  decompose_and_compress_next_polynomial_elements(double2 *result, int j) {
-    if (j == 0)
-      current_level -= 1;
-
-    int tid = threadIdx.x;
-    auto state_slice = state + j * params::degree;
-    for (int i = 0; i < params::opt / 2; i++) {
-      T res_re = state_slice[tid] & mask_mod_b;
-      T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
-      state_slice[tid] >>= base_log;
-      state_slice[tid + params::degree / 2] >>= base_log;
-      T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
-      T carry_im =
-          ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
-      carry_re >>= (base_log - 1);
-      carry_im >>= (base_log - 1);
-      state_slice[tid] += carry_re;
-      state_slice[tid + params::degree / 2] += carry_im;
-      res_re -= carry_re << base_log;
-      res_im -= carry_im << base_log;
-
-      result[i].x = (int32_t)res_re;
-      result[i].y = (int32_t)res_im;
-
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-  }
-
  __device__ void decompose_and_compress_level(double2 *result, int level) {
    for (int i = 0; i < level_count - level; i++)
      decompose_and_compress_next(result);
  }
 };

-template <typename T> class GadgetMatrixSingle {
-private:
-  uint32_t level_count;
-  uint32_t base_log;
-  uint32_t mask;
-  uint32_t halfbg;
-  T offset;
-
-public:
-  __device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
-      : base_log(base_log), level_count(level_count) {
-    uint32_t bg = 1 << base_log;
-    this->halfbg = bg / 2;
-    this->mask = bg - 1;
-    T temp = 0;
-    for (int i = 0; i < this->level_count; i++) {
-      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
-    }
-    this->offset = temp * this->halfbg;
-  }
-
-  __device__ T decompose_one_level_single(T element, uint32_t level) {
-    T s = element + this->offset;
-    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
-    T temp1 = (s >> decal) & this->mask;
-    return (T)(temp1 - this->halfbg);
-  }
-};
-
 template <typename Torus>
 __device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
  Torus res = state & mask_mod_b;
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
@@ -49,11 +49,15 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
 * global memory
 */
 template <typename T, typename ST, class params>
-void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
+void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
+                           uint32_t gpu_count, double2 *dest, T *src,
                           int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
                           uint32_t polynomial_size, uint32_t level_count,
-                           uint32_t gpu_index, uint32_t max_shared_memory) {
-  cudaSetDevice(stream->gpu_index);
+                           uint32_t max_shared_memory) {
+  if (gpu_count != 1)
+    PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
+          "supported yet.")
+  cudaSetDevice(gpu_indexes[0]);

  int shared_memory_size = sizeof(double) * polynomial_size;

@@ -62,11 +66,11 @@ void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,

  if (max_shared_memory < shared_memory_size) {
    device_batch_fft_ggsw_vector<T, ST, params, NOSM>
-        <<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
+        <<<gridSize, blockSize, 0, streams[0]>>>(dest, src, d_mem);
  } else {
    device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
-        <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
-                                                                      d_mem);
+        <<<gridSize, blockSize, shared_memory_size, streams[0]>>>(dest, src,
+                                                                  d_mem);
  }
  check_cuda_error(cudaGetLastError());
 }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -6,16 +6,19 @@
 * Head out to the equivalent operation on 64 bits for more details.
 */
 void cuda_keyswitch_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset) {
  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, static_cast<uint32_t *>(lwe_array_out),
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
      static_cast<uint32_t *>(lwe_array_in),
      static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
+      gpu_offset);
 }

 /* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
@@ -35,14 +38,17 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
 * 	- num_samples blocks of threads are launched
 */
 void cuda_keyswitch_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset) {
  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, static_cast<uint64_t *>(lwe_array_out),
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
      static_cast<uint64_t *>(lwe_array_in),
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
+      gpu_offset);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -3,8 +3,11 @@

 #include "device.h"
 #include "gadget.cuh"
+#include "helper_multi_gpu.h"
+#include "polynomial/functions.cuh"
 #include "polynomial/polynomial_math.cuh"
 #include "torus.cuh"
+#include "utils/kernel_dimensions.cuh"
 #include <thread>
 #include <vector>

@@ -31,110 +34,128 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
 * scaling factor) under key s2 instead of s1, with an increased noise
 *
 */
+// Each thread in x is used to calculate one output.
+// Threads in y are used to parallelize the lwe_dimension_in loop.
+// Shared memory is used to store intermediate results of the reduction.
 template <typename Torus>
-__global__ void
-keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
-          Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in,
-          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-          int lwe_lower, int lwe_upper, int cutoff) {
-  int tid = threadIdx.x;
+__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes,
+                          Torus *lwe_array_in, Torus *lwe_input_indexes,
+                          Torus *ksk, uint32_t lwe_dimension_in,
+                          uint32_t lwe_dimension_out, uint32_t base_log,
+                          uint32_t level_count, int gpu_offset) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;

  extern __shared__ int8_t sharedmem[];
+  Torus *lwe_acc_out = (Torus *)sharedmem;
+  auto block_lwe_array_out =
+      get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset],
+                lwe_dimension_out + 1);

-  Torus *local_lwe_array_out = (Torus *)sharedmem;
+  if (tid <= lwe_dimension_out) {

-  auto block_lwe_array_in = get_chunk(
-      lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
-  auto block_lwe_array_out = get_chunk(
-      lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);
+    Torus local_lwe_out = 0;
+    auto block_lwe_array_in =
+        get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset],
+                  lwe_dimension_in + 1);

-  auto gadget = GadgetMatrixSingle<Torus>(base_log, level_count);
+    if (tid == lwe_dimension_out && threadIdx.y == 0) {
+      local_lwe_out = block_lwe_array_in[lwe_dimension_in];
+    }
+    const Torus mask_mod_b = (1ll << base_log) - 1ll;

-  int lwe_part_per_thd;
-  if (tid < cutoff) {
-    lwe_part_per_thd = lwe_upper;
-  } else {
-    lwe_part_per_thd = lwe_lower;
-  }
-  __syncthreads();
+    const int pack_size = (lwe_dimension_in + blockDim.y - 1) / blockDim.y;
+    const int start_i = pack_size * threadIdx.y;
+    const int end_i = SEL(lwe_dimension_in, pack_size * (threadIdx.y + 1),
+                          pack_size * (threadIdx.y + 1) <= lwe_dimension_in);

-  for (int k = 0; k < lwe_part_per_thd; k++) {
-    int idx = tid + k * blockDim.x;
-    local_lwe_array_out[idx] = 0;
-  }
-  __syncthreads();
+    // This loop distribution seems to benefit the global mem reads
+    for (int i = start_i; i < end_i; i++) {
+      Torus a_i = round_to_closest_multiple(block_lwe_array_in[i], base_log,
+                                            level_count);
+      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);

-  if (tid == 0) {
-    local_lwe_array_out[lwe_dimension_out] =
-        block_lwe_array_in[lwe_dimension_in];
-  }
-
-  for (int i = 0; i < lwe_dimension_in; i++) {
-
-    __syncthreads();
-
-    Torus a_i =
-        round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);
-
-    Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
-    Torus mask_mod_b = (1ll << base_log) - 1ll;
-
-    for (int j = 0; j < level_count; j++) {
-      auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
-      Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
-      for (int k = 0; k < lwe_part_per_thd; k++) {
-        int idx = tid + k * blockDim.x;
-        local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
+      for (int j = 0; j < level_count; j++) {
+        auto ksk_block =
+            get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
+        Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
+        local_lwe_out -= (Torus)ksk_block[tid] * decomposed;
      }
    }
+
+    lwe_acc_out[shmem_index] = local_lwe_out;
  }

-  for (int k = 0; k < lwe_part_per_thd; k++) {
-    int idx = tid + k * blockDim.x;
-    block_lwe_array_out[idx] = local_lwe_array_out[idx];
+  if (tid <= lwe_dimension_out) {
+    for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
+         offset /= 2) {
+      __syncthreads();
+      lwe_acc_out[shmem_index] +=
+          lwe_acc_out[shmem_index + offset * blockDim.x];
+    }
+    if (threadIdx.y == 0)
+      block_lwe_array_out[tid] = lwe_acc_out[shmem_index];
  }
 }

-/// assume lwe_array_in in the gpu
 template <typename Torus>
 __host__ void cuda_keyswitch_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
+    Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t gpu_offset = 0) {

-  cudaSetDevice(stream->gpu_index);
-  constexpr int ideal_threads = 128;
+  cudaSetDevice(gpu_index);

-  int lwe_size = lwe_dimension_out + 1;
-  int lwe_lower, lwe_upper, cutoff;
-  if (lwe_size % ideal_threads == 0) {
-    lwe_lower = lwe_size / ideal_threads;
-    lwe_upper = lwe_size / ideal_threads;
-    cutoff = 0;
-  } else {
-    int y = ceil((double)lwe_size / (double)ideal_threads) * ideal_threads -
-            lwe_size;
-    cutoff = ideal_threads - y;
-    lwe_lower = lwe_size / ideal_threads;
-    lwe_upper = (int)ceil((double)lwe_size / (double)ideal_threads);
+  constexpr int num_threads_y = 32;
+  int num_blocks, num_threads_x;
+
+  getNumBlocksAndThreads2D(lwe_dimension_out + 1, 512, num_threads_y,
+                           num_blocks, num_threads_x);
+
+  int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
+  dim3 grid(num_blocks, num_samples, 1);
+  dim3 threads(num_threads_x, num_threads_y, 1);
+
+  keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
+      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset);
+  check_cuda_error(cudaGetLastError());
+}
+
+template <typename Torus>
+void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes,
+                       uint32_t gpu_count, Torus *lwe_array_out,
+                       Torus *lwe_output_indexes, Torus *lwe_array_in,
+                       Torus *lwe_input_indexes, Torus **ksks,
+                       uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+                       uint32_t base_log, uint32_t level_count,
+                       uint32_t num_samples, bool sync_streams = true) {
+
+  /// If the number of radix blocks is lower than the number of GPUs, not all
+  /// GPUs will be active and there will be 1 input per GPU
+  auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count);
+  int num_samples_on_gpu_0 = get_num_inputs_on_gpu(num_samples, 0, gpu_count);
+  if (sync_streams)
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+#pragma omp parallel for num_threads(active_gpu_count)
+  for (uint i = 0; i < active_gpu_count; i++) {
+    int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
+    int gpu_offset = get_gpu_offset(num_samples, i, gpu_count);
+
+    // Compute Keyswitch
+    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
+        streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes,
+        lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in,
+        lwe_dimension_out, base_log, level_count, num_samples_on_gpu,
+        gpu_offset);
  }

-  int lwe_size_after = lwe_size * num_samples;
-
-  int shared_mem = sizeof(Torus) * lwe_size;
-
-  cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
-  check_cuda_error(cudaGetLastError());
-
-  dim3 grid(num_samples, 1, 1);
-  dim3 threads(ideal_threads, 1, 1);
-
-  keyswitch<Torus><<<grid, threads, shared_mem, stream->stream>>>(
-      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
-      lwe_upper, cutoff);
-  check_cuda_error(cudaGetLastError());
+  if (sync_streams)
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -3,14 +3,23 @@
 #include <cuda_runtime.h>

 /// Unsafe function to create a CUDA stream, must check first that GPU exists
-cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
+cudaStream_t cuda_create_stream(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
-  cuda_stream_t *stream = new cuda_stream_t(gpu_index);
+  cudaStream_t stream;
+  check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  return stream;
 }

 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
-void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); }
+void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaStreamDestroy(stream));
+}
+
+void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaStreamSynchronize(stream));
+}

 /// Unsafe function that will try to allocate even if gpu_index is invalid
 /// or if there's not enough memory. A safe wrapper around it must call
@@ -25,20 +34,20 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {

 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
+                        uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
  void *ptr;

 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)
  int support_async_alloc;
-  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
-                                          cudaDevAttrMemoryPoolsSupported,
-                                          stream->gpu_index));
+  check_cuda_error(cudaDeviceGetAttribute(
+      &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

  if (support_async_alloc) {
-    check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream->stream));
+    check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream));
  } else {
    check_cuda_error(cudaMalloc((void **)&ptr, size));
  }
@@ -71,46 +80,61 @@ bool cuda_check_support_cooperative_groups() {
  return cooperative_groups_supported > 0;
 }

+/// Reports whether Thread Block Clusters can be launched. Returns
+///  false if Thread Block Cluster is not supported.
+///  true otherwise
 /// The answer is always false when the build-time CUDA_ARCH value is below
 /// 900 (or CUDA_ARCH is undefined, which the preprocessor evaluates as 0).
 /// NOTE(review): the cudaDevAttrClusterLaunch attribute is read from device 0
 /// only, regardless of which GPU the caller intends to use.
+bool cuda_check_support_thread_block_clusters() {
 // NOTE(review): CUDA_ARCH (not the compiler-provided __CUDA_ARCH__) is
 // presumably a definition injected by the build system -- confirm.
+#if CUDA_ARCH >= 900
+  // To-do: Is this really the best way to check support?
+  int tbc_supported = 0;
+  check_cuda_error(
+      cudaDeviceGetAttribute(&tbc_supported, cudaDevAttrClusterLaunch, 0));
+
+  return tbc_supported > 0;
+#else
+  return false;
+#endif
+}
+
 /// Copy memory to the GPU asynchronously.
 /// Queues a host-to-device copy of `size` bytes from `src` to `dest` on
 /// `stream`, after making `gpu_index` the current device. Returns immediately
 /// when `size` is 0 (no pointer validation is done in that case).
 void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                              cuda_stream_t *stream) {
+                              cudaStream_t stream, uint32_t gpu_index) {
  if (size == 0)
    return;
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
  // NOTE(review): because of `&&`, this panics only when `dest` is BOTH on a
  // device other than gpu_index AND not device memory -- confirm that `||`
  // was not intended.
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
+  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
  }

-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(
-      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
+      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }

 /// Copy memory between device buffers asynchronously, either within one GPU
 /// or from one GPU to another.
 /// Both `dest` and `src` must be device memory (otherwise PANIC). The copy of
 /// `size` bytes is queued on `stream` with `gpu_index` as the current device;
 /// nothing is done when `size` is 0.
 void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                                  cuda_stream_t *stream) {
+                                  cudaStream_t stream, uint32_t gpu_index) {
  if (size == 0)
    return;
  cudaPointerAttributes attr_dest;
  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.device != stream->gpu_index &&
-      attr_dest.type != cudaMemoryTypeDevice) {
+  if (attr_dest.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
  }
  cudaPointerAttributes attr_src;
  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.device != stream->gpu_index &&
-      attr_src.type != cudaMemoryTypeDevice) {
+  if (attr_src.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
-  if (attr_src.device != attr_dest.device) {
-    PANIC("Cuda error: different devices specified in copy from GPU to GPU.")
+  check_cuda_error(cudaSetDevice(gpu_index));
  // Same device: plain device-to-device copy. Different devices: peer copy
  // using the device ids reported by the pointer attributes.
+  if (attr_src.device == attr_dest.device) {
+    check_cuda_error(
+        cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
+  } else {
+    check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
+                                         attr_src.device, size, stream));
  }
-
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
-  check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
-                                   stream->stream));
 }

 /// Synchronizes device
@@ -120,16 +144,16 @@ void cuda_synchronize_device(uint32_t gpu_index) {
 }

 /// Queues a memset of `size` bytes at device pointer `dest` on `stream`,
 /// after making `gpu_index` the current device. No-op when `size` is 0.
 /// NOTE(review): `val` is forwarded to cudaMemsetAsync, whose value parameter
 /// is an int interpreted as a byte -- confirm callers only pass byte values.
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
-                       cuda_stream_t *stream) {
+                       cudaStream_t stream, uint32_t gpu_index) {
  if (size == 0)
    return;
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
  // NOTE(review): with `&&` this panics only when `dest` is both on another
  // device and not device memory -- confirm that `||` was not intended.
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
+  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
  }
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
-  check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }

 template <typename Torus>
@@ -140,42 +164,45 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
 }

 /// Launches cuda_set_value_kernel on `stream` over `n` elements of the device
 /// array `d_array` (presumably writing `value` to each of them), after making
 /// `gpu_index` the current device. Does nothing when `n` is 0; panics if
 /// `d_array` is not device memory.
 template <typename Torus>
-void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
-                          Torus n) {
+void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
+                          Torus *d_array, Torus value, Torus n) {
+  // Nothing to write: a launch with zero blocks is rejected by CUDA as an
+  // invalid configuration, so return early like the other async helpers do.
+  if (n == 0)
+    return;
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
  if (attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in cuda set value.")
  }
+  check_cuda_error(cudaSetDevice(gpu_index));
  // One thread per element, rounded up to whole blocks of 256 threads.
  int block_size = 256;
  int num_blocks = (n + block_size - 1) / block_size;

  // Launch the kernel
-  cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
-                                                                n);
+  cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
+                                                               n);
  check_cuda_error(cudaGetLastError());
 }

 /// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
 /// (Torus = uint64_t and Torus = uint32_t), presumably so that code outside
 /// this translation unit can link against the template.
-template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
-                                   uint64_t value, uint64_t n);
-template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
-                                   uint32_t value, uint32_t n);
+template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
+                                   uint64_t *d_array, uint64_t value,
+                                   uint64_t n);
+template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
+                                   uint32_t *d_array, uint32_t value,
+                                   uint32_t n);

 /// Copy memory to the CPU asynchronously.
 /// Queues a device-to-host copy of `size` bytes from `src` to `dest` on
 /// `stream`, after making `gpu_index` the current device. Returns immediately
 /// when `size` is 0. The host buffer is only safe to read once the stream has
 /// been synchronized.
 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                              cuda_stream_t *stream) {
+                              cudaStream_t stream, uint32_t gpu_index) {
  if (size == 0)
    return;
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, src));
  // NOTE(review): with `&&` this panics only when `src` is both on another
  // device and not device memory -- confirm that `||` was not intended.
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
+  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
  }

-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(
-      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
+      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }

 /// Return number of GPUs available
@@ -192,19 +219,18 @@ void cuda_drop(void *ptr, uint32_t gpu_index) {
 }

 /// Drop a cuda array asynchronously, if supported on the device
-void cuda_drop_async(void *ptr, cuda_stream_t *stream) {
+void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {

-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+  check_cuda_error(cudaSetDevice(gpu_index));
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)
  int support_async_alloc;
-  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
-                                          cudaDevAttrMemoryPoolsSupported,
-                                          stream->gpu_index));
+  check_cuda_error(cudaDeviceGetAttribute(
+      &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

  if (support_async_alloc) {
-    check_cuda_error(cudaFreeAsync(ptr, stream->stream));
+    check_cuda_error(cudaFreeAsync(ptr, stream));
  } else {
    check_cuda_error(cudaFree(ptr));
  }
@@ -223,13 +249,11 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
  return max_shared_memory;
 }

-void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }
-
 /// Registers `callback` on `stream` via cudaStreamAddCallback, passing
 /// `user_data` through to it. `gpu_index` is made the current device first;
 /// the trailing 0 is the API's flags argument.
-void cuda_stream_add_callback(cuda_stream_t *stream,
+void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
                              cudaStreamCallback_t callback, void *user_data) {

-  check_cuda_error(
-      cudaStreamAddCallback(stream->stream, callback, user_data, 0));
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
 }

 void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -1,13 +1,13 @@
 #include "integer/bitwise_ops.cuh"

 void scratch_cuda_integer_radix_bitop_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    BITOP_TYPE op_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -15,37 +15,42 @@ void scratch_cuda_integer_radix_bitop_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_bitop_kb<uint64_t>(
-      stream, (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count,
-      params, op_type, allocate_gpu_memory);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count, params,
+      op_type, allocate_gpu_memory);
 }

 /// 64-bit entry point for the two-operand bitwise operation.
 /// Casts the untyped arguments to their uint64_t counterparts (`streams` to
 /// cudaStream_t *, `mem_ptr` to int_bitop_buffer<uint64_t> *, `ksks` to
 /// uint64_t **) and forwards everything to
 /// host_integer_radix_bitop_kb<uint64_t>.
 void cuda_bitop_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
-    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t lwe_ciphertext_count) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitop_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_1),
      static_cast<uint64_t *>(lwe_array_2),
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      lwe_ciphertext_count);
 }

 /// 64-bit entry point for the single-operand bitwise NOT.
 /// Casts the untyped arguments to their uint64_t counterparts and forwards
 /// them to host_integer_radix_bitnot_kb<uint64_t>.
 void cuda_bitnot_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
-    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void **bsks,
+    void **ksks, uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitnot_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_in),
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
      lwe_ciphertext_count);
 }

 /// Releases the resources held by the bitop buffer stored in `*mem_ptr_void`
 /// by calling its release() with the given streams and GPU indexes.
 /// NOTE(review): only release() is called here; the buffer object itself is
 /// not deleted and `*mem_ptr_void` is not reset -- confirm who owns it.
-void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_bitop(void **streams, uint32_t *gpu_indexes,
+                                uint32_t gpu_count, int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -13,40 +13,41 @@

 /// Applies the bivariate lookup table stored in `mem_ptr->lut` to the pair
 /// (`lwe_array_1`, `lwe_array_2`) over `num_radix_blocks` blocks, writing the
 /// result to `lwe_array_out`. The streams, GPU indexes and per-GPU key arrays
 /// (`bsks`, `ksks`) are passed through unchanged; the LUT's own
 /// params.message_modulus is supplied as the final argument.
 template <typename Torus>
 __host__ void
-host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
+host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                            uint32_t gpu_count, Torus *lwe_array_out,
                            Torus *lwe_array_1, Torus *lwe_array_2,
-                            int_bitop_buffer<Torus> *mem_ptr, void *bsk,
-                            Torus *ksk, uint32_t num_radix_blocks) {
+                            int_bitop_buffer<Torus> *mem_ptr, void **bsks,
+                            Torus **ksks, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
-      num_radix_blocks, lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
+      bsks, ksks, num_radix_blocks, lut, lut->params.message_modulus);
 }

 /// Applies the univariate lookup table stored in `mem_ptr->lut` to
 /// `lwe_array_in` over `num_radix_blocks` blocks, writing the result to
 /// `lwe_array_out`. Streams, GPU indexes and key arrays are passed through
 /// unchanged to integer_radix_apply_univariate_lookup_table_kb.
 template <typename Torus>
-__host__ void
-host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                             Torus *lwe_array_in,
-                             int_bitop_buffer<Torus> *mem_ptr, void *bsk,
-                             Torus *ksk, uint32_t num_radix_blocks) {
+__host__ void host_integer_radix_bitnot_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, int_bitop_buffer<Torus> *mem_ptr,
+    void **bsks, Torus **ksks, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks, lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
+      num_radix_blocks, lut);
 }

 /// Heap-allocates an int_bitop_buffer<Torus> for operation `op` and stores
 /// the pointer in `*mem_ptr`. All arguments are forwarded to the buffer's
 /// constructor; this function no longer selects a device itself.
 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_bitop_kb(
-    cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_bitop_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, BITOP_TYPE op, bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
-  *mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
-                                         allocate_gpu_memory);
+  *mem_ptr =
+      new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op, params,
+                                  num_radix_blocks, allocate_gpu_memory);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -1,12 +1,13 @@
 #include "integer/cmux.cuh"

 void scratch_cuda_integer_radix_cmux_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -17,29 +18,33 @@ void scratch_cuda_integer_radix_cmux_kb_64(
      [](uint64_t x) -> uint64_t { return x == 1; };

  scratch_cuda_integer_radix_cmux_kb(
-      stream, (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
 }

 /// 64-bit entry point for the encrypted conditional select (cmux).
 /// Casts the untyped arguments to their uint64_t counterparts (`streams` to
 /// cudaStream_t *, `mem_ptr` to int_cmux_buffer<uint64_t> *, `ksks` to
 /// uint64_t **) and forwards them to host_integer_radix_cmux_kb<uint64_t>.
 void cuda_cmux_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
-    void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t lwe_ciphertext_count) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
+    void *lwe_array_false, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t lwe_ciphertext_count) {

  host_integer_radix_cmux_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_condition),
      static_cast<uint64_t *>(lwe_array_true),
      static_cast<uint64_t *>(lwe_array_false),
-      (int_cmux_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
+      (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),

      lwe_ciphertext_count);
 }

 /// Releases the resources held by the cmux buffer stored in `*mem_ptr_void`
 /// by calling its release() with the given streams and GPU indexes.
 /// NOTE(review): only release() is called here; the buffer object itself is
 /// not deleted and `*mem_ptr_void` is not reset -- confirm who owns it.
-void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
+void cleanup_cuda_integer_radix_cmux(void **streams, uint32_t *gpu_indexes,
+                                     uint32_t gpu_count,
                                     int8_t **mem_ptr_void) {

  int_cmux_buffer<uint64_t> *mem_ptr =
      (int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -5,12 +5,13 @@
 #include <omp.h>

 template <typename Torus>
-__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
+__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
+                          uint32_t gpu_count, Torus *lwe_array_out,
                          Torus *lwe_array_input, Torus *lwe_condition,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
-                          int_radix_lut<Torus> *predicate, void *bsk,
-                          Torus *ksk, uint32_t num_radix_blocks) {
-  cudaSetDevice(stream->gpu_index);
+                          int_radix_lut<Torus> *predicate, void **bsks,
+                          Torus **ksks, uint32_t num_radix_blocks) {
+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

  int big_lwe_size = params.big_lwe_dimension + 1;
@@ -27,8 +28,7 @@ __host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
    auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
    auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;

-    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
-                                   stream->stream>>>(
+    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
        lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
        lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
        params.message_modulus, 1);
@@ -36,25 +36,26 @@ __host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
-      predicate);
+      streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
+      ksks, num_radix_blocks, predicate);
 }

 template <typename Torus>
-__host__ void
-host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                           Torus *lwe_condition, Torus *lwe_array_true,
-                           Torus *lwe_array_false,
-                           int_cmux_buffer<Torus> *mem_ptr, void *bsk,
-                           Torus *ksk, uint32_t num_radix_blocks) {
+__host__ void host_integer_radix_cmux_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true,
+    Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void **bsks,
+    Torus **ksks, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;

  // Since our CPU threads will be working on different streams we shall assert
  // the work in the main stream is completed
-  stream->synchronize();
-  auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
-  auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;
+  auto true_streams = mem_ptr->zero_if_true_buffer->true_streams;
+  auto false_streams = mem_ptr->zero_if_false_buffer->false_streams;
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }

 #pragma omp parallel sections
  {
@@ -62,41 +63,46 @@ host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
 #pragma omp section
    {
      auto mem_true = mem_ptr->zero_if_true_buffer;
-      zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
-                  lwe_condition, mem_true, mem_ptr->inverted_predicate_lut, bsk,
-                  ksk, num_radix_blocks);
+      zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+                  lwe_array_true, lwe_condition, mem_true,
+                  mem_ptr->inverted_predicate_lut, bsks, ksks,
+                  num_radix_blocks);
    }
 #pragma omp section
    {
      auto mem_false = mem_ptr->zero_if_false_buffer;
-      zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
-                  lwe_condition, mem_false, mem_ptr->predicate_lut, bsk, ksk,
-                  num_radix_blocks);
+      zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
+                  lwe_array_false, lwe_condition, mem_false,
+                  mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
    }
  }
-  cuda_synchronize_stream(true_stream);
-  cuda_synchronize_stream(false_stream);
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
+    cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
+  }

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0. If the condition was false, true_ct will be 0 and false_ct will
  // have kept its value.
  auto added_cts = mem_ptr->tmp_true_ct;
-  host_addition(stream, added_cts, mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
-                params.big_lwe_dimension, num_radix_blocks);
+  host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
+                mem_ptr->tmp_false_ct, params.big_lwe_dimension,
+                num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, added_cts, bsk, ksk, num_radix_blocks,
-      mem_ptr->message_extract_lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
+      num_radix_blocks, mem_ptr->message_extract_lut);
 }

 /// Heap-allocates an int_cmux_buffer<Torus> built from `predicate_lut_f` and
 /// `params`, and stores the pointer in `*mem_ptr`. All arguments are
 /// forwarded to the buffer's constructor; this function no longer selects a
 /// device itself.
 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_cmux_kb(
-    cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_cmux_buffer<Torus> **mem_ptr,
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
-  *mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
+  *mem_ptr = new int_cmux_buffer<Torus>(streams, gpu_indexes, gpu_count,
+                                        predicate_lut_f, params,
                                        num_radix_blocks, allocate_gpu_memory);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -1,13 +1,13 @@
 #include "integer/comparison.cuh"

 void scratch_cuda_integer_radix_comparison_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -18,8 +18,9 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  case EQ:
  case NE:
    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-        stream, (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks,
-        params, op_type, false, allocate_gpu_memory);
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
+        op_type, false, allocate_gpu_memory);
    break;
  case GT:
  case GE:
@@ -28,16 +29,17 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  case MAX:
  case MIN:
    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-        stream, (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks,
-        params, op_type, is_signed, allocate_gpu_memory);
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
+        op_type, is_signed, allocate_gpu_memory);
    break;
  }
 }

 void cuda_comparison_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
-    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t num_radix_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t num_radix_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -45,39 +47,43 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  case EQ:
  case NE:
    host_integer_radix_equality_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
-        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), num_radix_blocks);
+        static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
+        num_radix_blocks);
    break;
  case GT:
  case GE:
  case LT:
  case LE:
    host_integer_radix_difference_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer,
-        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
+        buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
        num_radix_blocks);
    break;
  case MAX:
  case MIN:
    host_integer_radix_maxmin_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
-        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), num_radix_blocks);
+        static_cast<uint64_t *>(lwe_array_2), buffer, bsks, (uint64_t **)(ksks),
+        num_radix_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
  }
 }

 /// Releases the resources held by the comparison buffer stored in
 /// `*mem_ptr_void` by calling its release() with the given streams and GPU
 /// indexes.
 /// NOTE(review): only release() is called here; the buffer object itself is
 /// not deleted and `*mem_ptr_void` is not reset -- confirm who owns it.
-void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
+void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes,
+                                     uint32_t gpu_count,
                                     int8_t **mem_ptr_void) {

  int_comparison_buffer<uint64_t> *mem_ptr =
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -33,16 +33,17 @@ __global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
 }

 /// Launches device_accumulate_all_blocks on `stream` to combine the
 /// `num_radix_blocks` blocks of `input` into `output`. The launch geometry is
 /// derived from the number of entries per block (lwe_dimension + 1) with a
 /// cap of 512 passed to getNumBlocksAndThreads. `gpu_index` is made the
 /// current device first, and a failure to do so is now reported through
 /// check_cuda_error instead of being ignored.
 template <typename Torus>
-__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
-                                    Torus *input, uint32_t lwe_dimension,
+__host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
+                                    Torus *output, Torus *input,
+                                    uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  // Add all blocks and store in sum
-  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
+  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
      output, input, lwe_dimension, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
 }
@@ -55,13 +56,13 @@ __host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
 *
 */
 template <typename Torus>
-__host__ void
-are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
-                               Torus *lwe_array_in,
-                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                               Torus *ksk, uint32_t num_radix_blocks) {
+__host__ void are_all_comparisons_block_true(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -76,9 +77,10 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

-  cuda_memcpy_async_gpu_to_gpu(
-      tmp_out, lwe_array_in,
-      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
+  cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
+                               num_radix_blocks * (big_lwe_dimension + 1) *
+                                   sizeof(Torus),
+                               streams[0], gpu_indexes[0]);

  uint32_t remaining_blocks = num_radix_blocks;

@@ -91,17 +93,17 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = tmp_out;
    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
+    auto is_equal_to_num_blocks_map =
+        &are_all_block_true_buffer->is_equal_to_lut_map;
    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(stream, accumulator, input_blocks,
-                            big_lwe_dimension, chunk_length);
+      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
+                            input_blocks, big_lwe_dimension, chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
      input_blocks += (big_lwe_dimension + 1) * chunk_length;
    }
    accumulator = are_all_block_true_buffer->tmp_block_accumulated;
-    auto is_equal_to_num_blocks_map =
-        &are_all_block_true_buffer->is_equal_to_lut_map;

    // Selects a LUT
    int_radix_lut<Torus> *lut;
@@ -115,16 +117,20 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
        lut = (*is_equal_to_num_blocks_map)[chunk_length];
      } else {
        // LUT needs to be computed
-        auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
-                                                num_radix_blocks, true);
+        auto new_lut =
+            new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
+                                     max_value, num_radix_blocks, true);

        auto is_equal_to_num_blocks_lut_f = [max_value,
                                             chunk_length](Torus x) -> Torus {
          return (x & max_value) == chunk_length;
        };
        generate_device_accumulator<Torus>(
-            stream, new_lut->lut, glwe_dimension, polynomial_size,
-            message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);
+            streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
+            glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+            is_equal_to_num_blocks_lut_f);
+
+        new_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

        (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
        lut = new_lut;
@@ -135,11 +141,13 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          stream, lwe_array_out, accumulator, bsk, ksk, 1, lut);
+          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
+          ksks, 1, lut);
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          stream, tmp_out, accumulator, bsk, ksk, num_chunks, lut);
+          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
+          num_chunks, lut);
    }
  }
 }
@@ -152,9 +160,12 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
 */
 template <typename Torus>
 __host__ void is_at_least_one_comparisons_block_true(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {
+
+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -165,9 +176,10 @@ __host__ void is_at_least_one_comparisons_block_true(
  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

-  cuda_memcpy_async_gpu_to_gpu(
-      mem_ptr->tmp_lwe_array_out, lwe_array_in,
-      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
+  cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
+                               num_radix_blocks * (big_lwe_dimension + 1) *
+                                   sizeof(Torus),
+                               streams[0], gpu_indexes[0]);

  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 0) {
@@ -180,8 +192,8 @@ __host__ void is_at_least_one_comparisons_block_true(
    auto input_blocks = mem_ptr->tmp_lwe_array_out;
    auto accumulator = buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(stream, accumulator, input_blocks,
-                            big_lwe_dimension, chunk_length);
+      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
+                            input_blocks, big_lwe_dimension, chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
@@ -196,12 +208,13 @@ __host__ void is_at_least_one_comparisons_block_true(
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          stream, lwe_array_out, accumulator, bsk, ksk, 1, lut);
+          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
+          ksks, 1, lut);
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          stream, mem_ptr->tmp_lwe_array_out, accumulator, bsk, ksk, num_chunks,
-          lut);
+          streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
+          accumulator, bsks, ksks, num_chunks, lut);
    }
  }
 }
@@ -227,11 +240,12 @@ __host__ void is_at_least_one_comparisons_block_true(
 // are_all_comparisons_block_true
 template <typename Torus>
 __host__ void host_compare_with_zero_equality(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -256,7 +270,8 @@ __host__ void host_compare_with_zero_equality(

  if (num_radix_blocks == 1) {
    // Just copy
-    cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
+    cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes,
+                                 streams[0], gpu_indexes[0]);
    num_sum_blocks = 1;
  } else {
    uint32_t remainder_blocks = num_radix_blocks;
@@ -266,8 +281,8 @@ __host__ void host_compare_with_zero_equality(
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

-      accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
-                            chunk_size);
+      accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
+                            big_lwe_dimension, chunk_size);

      num_sum_blocks++;
      remainder_blocks -= (chunk_size - 1);
@@ -279,40 +294,44 @@ __host__ void host_compare_with_zero_equality(
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, sum, sum, bsk, ksk, num_sum_blocks, zero_comparison);
-  are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
-                                 num_sum_blocks);
+      streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
+      zero_comparison);
+  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                 sum, mem_ptr, bsks, ksks, num_sum_blocks);
 }

 template <typename Torus>
+// Equality check between two radix ciphertexts, done in two steps:
+//   1. apply the bivariate LUT `eq_buffer->operator_lut` to the blocks of
+//      lwe_array_1 / lwe_array_2, writing into mem_ptr->tmp_block_comparisons;
+//   2. reduce those per-block results into `lwe_array_out` with
+//      are_all_comparisons_block_true.
+// `bsks` / `ksks` are arrays of keys that are forwarded unchanged to the
+// helpers together with streams / gpu_indexes / gpu_count.
 __host__ void host_integer_radix_equality_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
-    Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {
 
-  cudaSetDevice(stream->gpu_index);
   auto eq_buffer = mem_ptr->eq_buffer;
 
   // Applies the LUT for the comparison operation
   auto comparisons = mem_ptr->tmp_block_comparisons;
   integer_radix_apply_bivariate_lookup_table_kb(
-      stream, comparisons, lwe_array_1, lwe_array_2, bsk, ksk, num_radix_blocks,
-      eq_buffer->operator_lut);
+      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
+      bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
+      eq_buffer->operator_lut->params.message_modulus);
 
   // This takes a Vec of blocks, where each block is either 0 or 1.
   //
   // It returns a block encrypting 1 if all input blocks are 1
   // otherwise the block encrypts 0
-  are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
-                                 bsk, ksk, num_radix_blocks);
+  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                 comparisons, mem_ptr, bsks, ksks,
+                                 num_radix_blocks);
 }

 template <typename Torus>
 __host__ void
-compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
+compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, Torus *lwe_array_out,
                        Torus *lwe_array_left, Torus *lwe_array_right,
-                        int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                        Torus *ksk, uint32_t num_radix_blocks) {
+                        int_comparison_buffer<Torus> *mem_ptr, void **bsks,
+                        Torus **ksks, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -334,21 +353,21 @@ compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,

  // Subtract
  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
-                   big_lwe_dimension, num_radix_blocks);
+  host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
+                   lwe_array_right, big_lwe_dimension, num_radix_blocks);

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb(
-      stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
-      is_non_zero_lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
+      num_radix_blocks, is_non_zero_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
-                                            big_lwe_dimension, num_radix_blocks,
-                                            message_modulus, carry_modulus);
+  host_integer_radix_add_scalar_one_inplace(
+      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
+      num_radix_blocks, message_modulus, carry_modulus);
 }

 // Reduces a vec containing shortint blocks that encrypts a sign
@@ -356,13 +375,13 @@ compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
 // final sign
 template <typename Torus>
 __host__ void
-tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
+tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
+                    uint32_t gpu_count, Torus *lwe_array_out,
                    Torus *lwe_block_comparisons,
                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
-                    std::function<Torus(Torus)> sign_handler_f, void *bsk,
-                    Torus *ksk, uint32_t num_radix_blocks) {
+                    std::function<Torus(Torus)> sign_handler_f, void **bsks,
+                    Torus **ksks, uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = tree_buffer->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -381,16 +400,19 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
  auto y = tree_buffer->tmp_y;
  if (x != lwe_block_comparisons)
    cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
-                                 big_lwe_size_bytes * num_radix_blocks, stream);
+                                 big_lwe_size_bytes * num_radix_blocks,
+                                 streams[0], gpu_indexes[0]);

  uint32_t partial_block_count = num_radix_blocks;

  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
-    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
+    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                partial_block_count, 4);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, x, y, bsk, ksk, partial_block_count >> 1, inner_tree_leaf);
+        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
+        partial_block_count >> 1, inner_tree_leaf);

    if ((partial_block_count % 2) != 0) {
      partial_block_count >>= 1;
@@ -400,7 +422,8 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
      auto last_x_block = x + (partial_block_count - 1) * big_lwe_size;

      cuda_memcpy_async_gpu_to_gpu(last_x_block, last_y_block,
-                                   big_lwe_size_bytes, stream);
+                                   big_lwe_size_bytes, streams[0],
+                                   gpu_indexes[0]);
    } else {
      partial_block_count >>= 1;
    }
@@ -411,7 +434,8 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
  std::function<Torus(Torus)> f;

  if (partial_block_count == 2) {
-    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
+    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                partial_block_count, 4);

    f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
      int msb = (x >> 2) & 3;
@@ -425,23 +449,25 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
    y = x;
    f = sign_handler_f;
  }
-  generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
-                                     polynomial_size, message_modulus,
-                                     carry_modulus, f);
+  generate_device_accumulator<Torus>(
+      streams[0], gpu_indexes[0], last_lut->get_lut(gpu_indexes[0], 0),
+      glwe_dimension, polynomial_size, message_modulus, carry_modulus, f);
+  last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

  // Last leaf
-  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
-                                                 ksk, 1, last_lut);
+  integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
+                                                 gpu_count, lwe_array_out, y,
+                                                 bsks, ksks, 1, last_lut);
 }

 template <typename Torus>
 __host__ void host_integer_radix_difference_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
-    Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right,
+    int_comparison_buffer<Torus> *mem_ptr,
+    std::function<Torus(Torus)> reduction_lut_f, void **bsks, Torus **ksks,
    uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
@@ -463,21 +489,21 @@ __host__ void host_integer_radix_difference_check_kb(
    if (mem_ptr->is_signed) {
      packed_num_radix_blocks -= 2;
    }
-    pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
-                packed_num_radix_blocks, message_modulus);
-    pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
-                packed_num_radix_blocks, message_modulus);
+    pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
+                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
+    pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
+                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
    // From this point we have half number of blocks
    packed_num_radix_blocks /= 2;

    // Clean noise
    auto identity_lut = mem_ptr->identity_lut;
    integer_radix_apply_univariate_lookup_table_kb(
-        stream, packed_left, packed_left, bsk, ksk, packed_num_radix_blocks,
-        identity_lut);
+        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
+        packed_num_radix_blocks, identity_lut);
    integer_radix_apply_univariate_lookup_table_kb(
-        stream, packed_right, packed_right, bsk, ksk, packed_num_radix_blocks,
-        identity_lut);
+        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
+        packed_num_radix_blocks, identity_lut);

    lhs = packed_left;
    rhs = packed_right;
@@ -492,14 +518,15 @@ __host__ void host_integer_radix_difference_check_kb(
  if (!mem_ptr->is_signed) {
    // Compare packed blocks, or simply the total number of radix blocks in the
    // inputs
-    compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
-                            packed_num_radix_blocks);
+    compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
+                            rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
    num_comparisons = packed_num_radix_blocks;
  } else {
    // Packing is possible
    if (carry_modulus >= message_modulus) {
      // Compare (num_radix_blocks - 2) / 2 packed blocks
-      compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
+      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
+                              rhs, mem_ptr, bsks, ksks,
                              packed_num_radix_blocks);

      // Compare the last block before the sign block separately
@@ -510,35 +537,38 @@ __host__ void host_integer_radix_difference_check_kb(
          diff_buffer->tmp_packed_right +
          packed_num_radix_blocks * big_lwe_size;
      integer_radix_apply_univariate_lookup_table_kb(
-          stream, last_left_block_before_sign_block,
-          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
+          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
+          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
          identity_lut);
      integer_radix_apply_univariate_lookup_table_kb(
-          stream, last_right_block_before_sign_block,
-          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
-          identity_lut);
+          streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
+          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
+          1, identity_lut);
      compare_radix_blocks_kb(
-          stream, comparisons + packed_num_radix_blocks * big_lwe_size,
+          streams, gpu_indexes, gpu_count,
+          comparisons + packed_num_radix_blocks * big_lwe_size,
          last_left_block_before_sign_block, last_right_block_before_sign_block,
-          mem_ptr, bsk, ksk, 1);
+          mem_ptr, bsks, ksks, 1);
      // Compare the sign block separately
      integer_radix_apply_bivariate_lookup_table_kb(
-          stream, comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
+          streams, gpu_indexes, gpu_count,
+          comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
-          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
-          mem_ptr->signed_lut);
+          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsks, ksks,
+          1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
      num_comparisons = packed_num_radix_blocks + 2;

    } else {
-      compare_radix_blocks_kb(stream, comparisons, lwe_array_left,
-                              lwe_array_right, mem_ptr, bsk, ksk,
-                              num_radix_blocks - 1);
+      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
+                              lwe_array_left, lwe_array_right, mem_ptr, bsks,
+                              ksks, num_radix_blocks - 1);
      // Compare the sign block separately
      integer_radix_apply_bivariate_lookup_table_kb(
-          stream, comparisons + (num_radix_blocks - 1) * big_lwe_size,
+          streams, gpu_indexes, gpu_count,
+          comparisons + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
-          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
-          mem_ptr->signed_lut);
+          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsks, ksks,
+          1, mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
      num_comparisons = num_radix_blocks;
    }
  }
@@ -546,39 +576,42 @@ __host__ void host_integer_radix_difference_check_kb(
  // Reduces a vec containing radix blocks that encrypts a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
-  tree_sign_reduction(stream, lwe_array_out, comparisons,
-                      mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
-                      ksk, num_comparisons);
+  tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
+                      comparisons, mem_ptr->diff_buffer->tree_buffer,
+                      reduction_lut_f, bsks, ksks, num_comparisons);
 }

 template <typename Torus>
+// Allocates a new int_comparison_buffer<Torus> for comparison type `op` and
+// stores the pointer in `*mem_ptr`. All arguments (streams, GPU indexes,
+// parameters, block count, signedness, allocate_gpu_memory) are forwarded to
+// the buffer constructor as-is.
 __host__ void scratch_cuda_integer_radix_comparison_check_kb(
-    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
-    bool is_signed, bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_comparison_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, COMPARISON_TYPE op, bool is_signed,
+    bool allocate_gpu_memory) {
 
-  cudaSetDevice(stream->gpu_index);
-  *mem_ptr = new int_comparison_buffer<Torus>(
-      stream, op, params, num_radix_blocks, is_signed, allocate_gpu_memory);
+  *mem_ptr = new int_comparison_buffer<Torus>(streams, gpu_indexes, gpu_count,
+                                              op, params, num_radix_blocks,
+                                              is_signed, allocate_gpu_memory);
 }

 template <typename Torus>
 __host__ void
-host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
+// Max/min of two radix ciphertexts in two steps:
+//   1. host_integer_radix_difference_check_kb writes the comparison result of
+//      left vs right (reduced with mem_ptr->identity_lut_f) into
+//      mem_ptr->tmp_lwe_array_out;
+//   2. host_integer_radix_cmux_kb uses that result as the selector between
+//      lwe_array_left and lwe_array_right and writes to lwe_array_out.
+// NOTE(review): whether max or min is produced is not decided here; it is
+// presumably fixed by how mem_ptr / its cmux_buffer were set up - confirm.
+host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                             uint32_t gpu_count, Torus *lwe_array_out,
                              Torus *lwe_array_left, Torus *lwe_array_right,
-                             int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                             Torus *ksk, uint32_t total_num_radix_blocks) {
+                             int_comparison_buffer<Torus> *mem_ptr, void **bsks,
+                             Torus **ksks, uint32_t total_num_radix_blocks) {
 
-  cudaSetDevice(stream->gpu_index);
   // Compute the sign
   host_integer_radix_difference_check_kb(
-      stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
-      mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks);
+      streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
+      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
+      ksks, total_num_radix_blocks);
 
   // Selector
-  host_integer_radix_cmux_kb(
-      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
+  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
+                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
+                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
+                             total_num_radix_blocks);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -0,0 +1,85 @@
+#include "integer/div_rem.cuh"
+
+// 64-bit entry point that prepares the scratch memory for the integer div_rem
+// operation. Packs the individual parameters into an int_radix_params and
+// forwards to scratch_cuda_integer_div_rem_kb<uint64_t>, which receives
+// `mem_ptr` reinterpreted as int_div_rem_memory<uint64_t>**.
+// `streams` is passed as void** and is cast to cudaStream_t* here.
+void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);
+
+  scratch_cuda_integer_div_rem_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
+      allocate_gpu_memory);
+}
+
+// 64-bit entry point for the integer div_rem operation.
+// Reinterprets `mem_ptr` as int_div_rem_memory<uint64_t> and dispatches on
+// mem->params.polynomial_size to the host_integer_div_rem_kb instantiation
+// with the matching Degree<N> (N = 512 ... 16384); any other size PANICs.
+// The untyped quotient / remainder / numerator / divisor pointers are cast to
+// uint64_t*, `ksks` to uint64_t** and `streams` to cudaStream_t*; `bsks` is
+// forwarded as void**.
+void cuda_integer_div_rem_radix_ciphertext_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
+    void *remainder, void *numerator, void *divisor, int8_t *mem_ptr,
+    void **bsks, void **ksks, uint32_t num_blocks) {
+
+  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
+
+  // Every case makes the same call; only the Degree<N> template argument
+  // differs, since it has to be known at compile time.
+  switch (mem->params.polynomial_size) {
+  case 512:
+    host_integer_div_rem_kb<uint64_t, Degree<512>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
+        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
+    break;
+  case 1024:
+
+    host_integer_div_rem_kb<uint64_t, Degree<1024>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
+        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
+    break;
+  case 2048:
+    host_integer_div_rem_kb<uint64_t, Degree<2048>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
+        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
+    break;
+  case 4096:
+    host_integer_div_rem_kb<uint64_t, Degree<4096>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
+        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
+    break;
+  case 8192:
+    host_integer_div_rem_kb<uint64_t, Degree<8192>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
+        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
+    break;
+  case 16384:
+    host_integer_div_rem_kb<uint64_t, Degree<16384>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(quotient), static_cast<uint64_t *>(remainder),
+        static_cast<uint64_t *>(numerator), static_cast<uint64_t *>(divisor),
+        bsks, (uint64_t **)(ksks), mem, num_blocks);
+    break;
+  default:
+    PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
+          "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
+  }
+}
+
+// Releases the div_rem scratch memory referenced by `*mem_ptr_void` by calling
+// its release() with the given streams and GPU indexes. `streams` is passed as
+// void** and is cast to cudaStream_t* here.
+// NOTE(review): only release() is called in this function; whether the
+// int_div_rem_memory object itself is freed elsewhere is not visible here -
+// confirm.
+void cleanup_cuda_integer_div_rem(void **streams, uint32_t *gpu_indexes,
+                                  uint32_t gpu_count, int8_t **mem_ptr_void) {
+  int_div_rem_memory<uint64_t> *mem_ptr =
+      (int_div_rem_memory<uint64_t> *)(*mem_ptr_void);
+
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -0,0 +1,641 @@
+#ifndef TFHE_RS_DIV_REM_CUH
+#define TFHE_RS_DIV_REM_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer.h"
+#include "integer/comparison.cuh"
+#include "integer/integer.cuh"
+#include "integer/negation.cuh"
+#include "integer/scalar_shifts.cuh"
+#include "linear_algebra.h"
+#include "programmable_bootstrap.h"
+#include "utils/helper.cuh"
+#include "utils/kernel_dimensions.cuh"
+#include <fstream>
+#include <iostream>
+#include <omp.h>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Integer division of `a` by `b` rounded up (intended for positive operands).
+// Marked `inline` because this definition lives in a header: without it,
+// including the header from more than one translation unit results in
+// multiple-definition errors at link time.
+inline int ceil_div(int a, int b) { return (a + b - 1) / b; }
+
+// Lightweight, non-owning view over a contiguous list of LWE ciphertexts.
+// It makes it easier to move blocks between lists: the struct never allocates
+// or frees memory, it only tracks how many ciphertexts (`len`) are currently
+// considered to be stored in `data`, out of a capacity of `max_blocks`.
+template <typename Torus> struct lwe_ciphertext_list {
+  // first element of the first ciphertext (not owned)
+  Torus *data;
+  // capacity of the list, in ciphertexts
+  size_t max_blocks;
+  // number of ciphertexts currently tracked by the list
+  size_t len;
+  int_radix_params params;
+
+  // number of Torus elements in one ciphertext (big_lwe_dimension + 1)
+  size_t big_lwe_size;
+  // number of Torus elements in `max_blocks` ciphertexts
+  size_t radix_size;
+  // same quantities as above, expressed in bytes
+  size_t big_lwe_size_bytes;
+  size_t radix_size_bytes;
+  size_t big_lwe_dimension;
+
+  // Wraps `src` as a list of `max_blocks` ciphertexts; `len` starts at
+  // `max_blocks`. Members are initialized in declaration order.
+  lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
+      : data(src), max_blocks(max_blocks), len(max_blocks), params(params) {
+    big_lwe_size = params.big_lwe_dimension + 1;
+    big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+    radix_size = max_blocks * big_lwe_size;
+    radix_size_bytes = radix_size * sizeof(Torus);
+    big_lwe_dimension = params.big_lwe_dimension;
+  }
+
+  // copies ciphertexts from Torus*, starting from `start_block` up to and
+  // including `finish_block`, to the beginning of `data`; does not change the
+  // value of self len
+  void copy_from(Torus *src, size_t start_block, size_t finish_block,
+                 cudaStream_t stream, uint32_t gpu_index) {
+    size_t tmp_len = finish_block - start_block + 1;
+    cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
+                                 tmp_len * big_lwe_size_bytes, stream,
+                                 gpu_index);
+  }
+
+  // copies ciphertexts from lwe_ciphertext_list, starting from `start_block`
+  // up to and including `finish_block`; does not change the value of self len
+  void copy_from(const lwe_ciphertext_list &src, size_t start_block,
+                 size_t finish_block, cudaStream_t stream, uint32_t gpu_index) {
+    copy_from(src.data, start_block, finish_block, stream, gpu_index);
+  }
+
+  // copies ciphertexts from Torus*, starting from `start_block` up to and
+  // including `finish_block`, updating the value of self len.
+  // The length is computed with unsigned arithmetic, so the empty range
+  // `start_block == finish_block + 1` yields `len == 0` and a zero-byte copy.
+  void clone_from(Torus *src, size_t start_block, size_t finish_block,
+                  cudaStream_t stream, uint32_t gpu_index) {
+    len = finish_block - start_block + 1;
+
+    cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
+                                 len * big_lwe_size_bytes, stream, gpu_index);
+  }
+
+  // copies ciphertexts from lwe_ciphertext_list, starting from `start_block`
+  // up to and including `finish_block`, updating the value of self len
+  void clone_from(const lwe_ciphertext_list &src, size_t start_block,
+                  size_t finish_block, cudaStream_t stream,
+                  uint32_t gpu_index) {
+    clone_from(src.data, start_block, finish_block, stream, gpu_index);
+  }
+
+  // assign zero to blocks starting from `start_block` up to and including
+  // `finish_block`
+  void assign_zero(size_t start_block, size_t finish_block, cudaStream_t stream,
+                   uint32_t gpu_index) {
+    auto size = finish_block - start_block + 1;
+    cuda_memset_async(&data[start_block * big_lwe_size], 0,
+                      size * big_lwe_size_bytes, stream, gpu_index);
+  }
+
+  // return pointer to last block; the list must not be empty
+  Torus *last_block() {
+    assert(len > 0);
+    return &data[(len - 1) * big_lwe_size];
+  }
+
+  // return pointer to first block
+  Torus *first_block() { return data; }
+
+  // return block with `index`
+  Torus *get_block(size_t index) {
+    assert(index < len);
+    return &data[index * big_lwe_size];
+  }
+
+  bool is_empty() { return len == 0; }
+
+  // does not drop actual memory from `data`, only reduces value of `len` by
+  // one; popping an empty list is a programming error (asserted) and leaves
+  // `len` untouched when assertions are disabled
+  void pop() {
+    assert(len > 0);
+    if (len > 0)
+      len--;
+  }
+
+  // insert ciphertext at index `ind`, shifting the blocks `ind..len` one
+  // position towards the end
+  void insert(size_t ind, Torus *ciphertext_block, cudaStream_t stream,
+              uint32_t gpu_index) {
+    assert(ind <= len);
+    assert(len < max_blocks);
+
+    size_t insert_offset = ind * big_lwe_size;
+
+    // move blocks one by one starting from the last one, so that no block is
+    // overwritten before it has been copied
+    for (size_t i = len; i > ind; i--) {
+      Torus *src = &data[(i - 1) * big_lwe_size];
+      Torus *dst = &data[i * big_lwe_size];
+      cuda_memcpy_async_gpu_to_gpu(dst, src, big_lwe_size_bytes, stream,
+                                   gpu_index);
+    }
+
+    cuda_memcpy_async_gpu_to_gpu(&data[insert_offset], ciphertext_block,
+                                 big_lwe_size_bytes, stream, gpu_index);
+    len++;
+  }
+
+  // push ciphertext at the end of `data`
+  void push(Torus *ciphertext_block, cudaStream_t stream, uint32_t gpu_index) {
+    assert(len < max_blocks);
+
+    size_t offset = len * big_lwe_size;
+    cuda_memcpy_async_gpu_to_gpu(&data[offset], ciphertext_block,
+                                 big_lwe_size_bytes, stream, gpu_index);
+    len++;
+  }
+
+  // duplicate ciphertext into `number_of_blocks` ciphertexts and set `len`
+  // to `number_of_blocks`
+  void fill_with_same_ciphertext(Torus *ciphertext, size_t number_of_blocks,
+                                 cudaStream_t stream, uint32_t gpu_index) {
+    assert(number_of_blocks <= max_blocks);
+
+    for (size_t i = 0; i < number_of_blocks; i++) {
+      Torus *dest = &data[i * big_lwe_size];
+      cuda_memcpy_async_gpu_to_gpu(dest, ciphertext, big_lwe_size_bytes, stream,
+                                   gpu_index);
+    }
+
+    len = number_of_blocks;
+  }
+
+  // used for debugging, prints body of each ciphertext.
+  void print_blocks_body(const char *name) {
+    for (size_t i = 0; i < len; i++) {
+      print_debug(name, &data[i * big_lwe_size + big_lwe_dimension], 1);
+    }
+  }
+};
+
+// Creates the scratch memory object used by the div_rem operation and stores
+// it in `*mem_ptr`. All the other arguments are forwarded unchanged to the
+// `int_div_rem_memory<Torus>` constructor.
+template <typename Torus>
+__host__ void scratch_cuda_integer_div_rem_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_div_rem_memory<Torus> **mem_ptr, uint32_t num_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {
+
+  auto *div_rem_mem = new int_div_rem_memory<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
+  *mem_ptr = div_rem_mem;
+}
+
+// Computes the quotient and the remainder of `numerator / divisor` on radix
+// ciphertexts made of `num_blocks` blocks, writing the results to `quotient`
+// and `remainder`.
+//
+// The numerator bits are processed one at a time, from the most significant
+// to the least significant. For each bit:
+//  1. the remainder is shifted left by one bit and the next numerator bit is
+//     brought in, while the divisor is split at the matching bit position,
+//  2. the divisor is subtracted from the shifted remainder (with overflow
+//     detection) and the upper divisor blocks are compared with zero,
+//  3. the shifted remainder and the subtraction result are passed through
+//     the `zero_out_if_overflow_*` lookup tables and the quotient bit is
+//     added to the quotient.
+// The remainder is kept as two radix buffers (`remainder1`, `remainder2`)
+// that are summed at the end to produce the final remainder.
+//
+// Independent steps of each phase are launched from OpenMP sections on the
+// dedicated `mem_ptr->sub_streams_*`, and every phase is surrounded by
+// explicit stream synchronizations.
+//
+// `mem_ptr` provides every temporary buffer and lookup table; `bsks` and
+// `ksks` are forwarded to the lookup-table / shift / comparison helpers.
+template <typename Torus, class params>
+__host__ void
+host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, Torus *quotient, Torus *remainder,
+                        Torus *numerator, Torus *divisor, void **bsks,
+                        uint64_t **ksks, int_div_rem_memory<uint64_t> *mem_ptr,
+                        uint32_t num_blocks) {
+
+  auto radix_params = mem_ptr->params;
+
+  auto big_lwe_dimension = radix_params.big_lwe_dimension;
+  auto big_lwe_size = big_lwe_dimension + 1;
+  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+
+  uint32_t message_modulus = radix_params.message_modulus;
+  uint32_t carry_modulus = radix_params.carry_modulus;
+  // log2 of the message modulus: index of its highest set bit
+  uint32_t num_bits_in_message = 31 - __builtin_clz(message_modulus);
+  uint32_t total_bits = num_bits_in_message * num_blocks;
+
+  // put temporary buffers in lwe_ciphertext_list for easy use
+  lwe_ciphertext_list<Torus> remainder1(mem_ptr->remainder1, radix_params,
+                                        num_blocks);
+  lwe_ciphertext_list<Torus> remainder2(mem_ptr->remainder2, radix_params,
+                                        num_blocks);
+  lwe_ciphertext_list<Torus> numerator_block_stack(
+      mem_ptr->numerator_block_stack, radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> numerator_block_1(mem_ptr->numerator_block_1,
+                                               radix_params, 1);
+  lwe_ciphertext_list<Torus> tmp_radix(mem_ptr->tmp_radix, radix_params,
+                                       num_blocks + 1);
+  lwe_ciphertext_list<Torus> interesting_remainder1(
+      mem_ptr->interesting_remainder1, radix_params, num_blocks + 1);
+  lwe_ciphertext_list<Torus> interesting_remainder2(
+      mem_ptr->interesting_remainder2, radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> interesting_divisor(mem_ptr->interesting_divisor,
+                                                 radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> divisor_ms_blocks(mem_ptr->divisor_ms_blocks,
+                                               radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> new_remainder(mem_ptr->new_remainder, radix_params,
+                                           num_blocks);
+  lwe_ciphertext_list<Torus> subtraction_overflowed(
+      mem_ptr->subtraction_overflowed, radix_params, 1);
+  lwe_ciphertext_list<Torus> did_not_overflow(mem_ptr->did_not_overflow,
+                                              radix_params, 1);
+  lwe_ciphertext_list<Torus> overflow_sum(mem_ptr->overflow_sum, radix_params,
+                                          1);
+  lwe_ciphertext_list<Torus> overflow_sum_radix(mem_ptr->overflow_sum_radix,
+                                                radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> tmp_1(mem_ptr->tmp_1, radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> at_least_one_upper_block_is_non_zero(
+      mem_ptr->at_least_one_upper_block_is_non_zero, radix_params, 1);
+  lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
+      mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);
+
+  // initial state: the stack holds the whole numerator, both remainder halves
+  // and the quotient are zero
+  numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, streams[0],
+                                   gpu_indexes[0]);
+  remainder1.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
+  remainder2.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
+
+  cuda_memset_async(quotient, 0, big_lwe_size_bytes * num_blocks, streams[0],
+                    gpu_indexes[0]);
+
+  // one iteration per numerator bit, most significant bit first
+  for (int i = total_bits - 1; i >= 0; i--) {
+    uint32_t block_of_bit = i / num_bits_in_message;
+    uint32_t pos_in_block = i % num_bits_in_message;
+    uint32_t msb_bit_set = total_bits - 1 - i;
+    uint32_t last_non_trivial_block = msb_bit_set / num_bits_in_message;
+
+    // Index to the first block of the remainder that is fully trivial 0
+    // and all blocks after it are also trivial zeros
+    // This number is in range 1..=num_blocks
+    uint32_t first_trivial_block = last_non_trivial_block + 1;
+
+    interesting_remainder1.clone_from(remainder1, 0, last_non_trivial_block,
+                                      streams[0], gpu_indexes[0]);
+    interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
+                                      streams[0], gpu_indexes[0]);
+    interesting_divisor.clone_from(divisor, 0, last_non_trivial_block,
+                                   streams[0], gpu_indexes[0]);
+    // the range is empty (len == 0) once every divisor block is "interesting"
+    divisor_ms_blocks.clone_from(divisor,
+                                 (msb_bit_set + 1) / num_bits_in_message,
+                                 num_blocks - 1, streams[0], gpu_indexes[0]);
+
+    // We split the divisor at a block position, when in reality the split
+    // should be at a bit position meaning that potentially (depending on
+    // msb_bit_set) the split versions share some bits they should not. So we do
+    // one PBS on the last block of the interesting_divisor, and first block of
+    // divisor_ms_blocks to trim out bits which should not be there
+    auto trim_last_interesting_divisor_bits =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          // the split falls on a block boundary: nothing to trim
+          if ((msb_bit_set + 1) % num_bits_in_message == 0) {
+            return;
+          }
+          // The last block of the interesting part of the divisor
+          // can contain bits which we should not account for
+          // we have to zero them out.
+
+          // Where the msb is set in the block
+          uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
+
+          // e.g 2 bits in message:
+          // if pos_in_block is 0, then we want to keep only the first bit
+          // (right shift mask by 1)
+          // if pos_in_block is 1, then we want to keep the two bits
+          // (right shift mask by 0)
+          uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
+
+          // Create mask of 1s on the message part, 0s in the carries
+          uint32_t full_message_mask = message_modulus - 1;
+
+          // Shift the mask so that we will only keep bits we should
+          uint32_t shifted_mask = full_message_mask >> shift_amount;
+
+          integer_radix_apply_univariate_lookup_table_kb(
+              streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
+              interesting_divisor.last_block(), bsks, ksks, 1,
+              mem_ptr->masking_luts_1[shifted_mask]);
+        }; // trim_last_interesting_divisor_bits
+
+    auto trim_first_divisor_ms_bits =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          // nothing to trim when there is no upper block, or when the split
+          // falls on a block boundary
+          if (divisor_ms_blocks.is_empty() ||
+              ((msb_bit_set + 1) % num_bits_in_message) == 0) {
+            return;
+          }
+          // Where the msb is set in the block
+          uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
+
+          // e.g 2 bits in message:
+          // if pos_in_block is 0, then we want to discard the first bit
+          // (left shift mask by 1)
+          // if pos_in_block is 1, then we want to discard the two bits
+          // (left shift mask by 2)
+          uint32_t shift_amount = pos_in_block + 1;
+          uint32_t full_message_mask = message_modulus - 1;
+          uint32_t shifted_mask = full_message_mask << shift_amount;
+
+          // Keep the mask within the range of message bits, so that
+          // the estimated degree of the output is < msg_modulus
+          shifted_mask = shifted_mask & full_message_mask;
+
+          integer_radix_apply_univariate_lookup_table_kb(
+              streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
+              divisor_ms_blocks.first_block(), bsks, ksks, 1,
+              mem_ptr->masking_luts_2[shifted_mask]);
+        }; // trim_first_divisor_ms_bits
+
+    // This does
+    //  R := R << 1; R(0) := N(i)
+    //
+    // We could do that by left shifting R by one, then unchecked_add the
+    // correct numerator bit.
+    //
+    // However, to keep the remainder clean (noise wise), what we do is that we
+    // put the numerator block from which we need to extract the bit, as the
+    // LSB of the Remainder, so that left shifting will pull the bit we need.
+    auto left_shift_interesting_remainder1 =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          // take the block on top of the numerator stack and put it in front
+          // of the remainder
+          numerator_block_1.clone_from(
+              numerator_block_stack, numerator_block_stack.len - 1,
+              numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
+          numerator_block_stack.pop();
+          interesting_remainder1.insert(0, numerator_block_1.first_block(),
+                                        streams[0], gpu_indexes[0]);
+
+          host_integer_radix_logical_scalar_shift_kb_inplace(
+              streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
+              mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
+
+          tmp_radix.clone_from(interesting_remainder1, 0,
+                               interesting_remainder1.len - 1, streams[0],
+                               gpu_indexes[0]);
+
+          // rotate so that the (shifted) numerator block ends up last, where
+          // it can be read back and popped
+          host_radix_blocks_rotate_left(
+              streams, gpu_indexes, gpu_count, interesting_remainder1.data,
+              tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
+
+          numerator_block_1.clone_from(
+              interesting_remainder1, interesting_remainder1.len - 1,
+              interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);
+
+          interesting_remainder1.pop();
+
+          if (pos_in_block != 0) {
+            // We have not yet extracted all the bits from this numerator
+            // so, we put it back on the front so that it gets taken next
+            // iteration
+            numerator_block_stack.push(numerator_block_1.first_block(),
+                                       streams[0], gpu_indexes[0]);
+          }
+        }; // left_shift_interesting_remainder1
+
+    auto left_shift_interesting_remainder2 =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          host_integer_radix_logical_scalar_shift_kb_inplace(
+              streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
+              mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
+        }; // left_shift_interesting_remainder2
+
+    // phase 1: wait for the copies issued on the main streams, then run the
+    // four independent steps on their own sub-streams
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
+#pragma omp parallel sections
+    {
+#pragma omp section
+      {
+        // interesting_divisor
+        trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
+                                           gpu_count);
+      }
+#pragma omp section
+      {
+        // divisor_ms_blocks
+        trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes,
+                                   gpu_count);
+      }
+#pragma omp section
+      {
+        // interesting_remainder1
+        // numerator_block_stack
+        left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
+                                          gpu_count);
+      }
+#pragma omp section
+      {
+        // interesting_remainder2
+        left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
+                                          gpu_count);
+      }
+    }
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
+    }
+
+    // if interesting_remainder1 != 0 -> interesting_remainder2 == 0
+    // if interesting_remainder1 == 0 -> interesting_remainder2 != 0
+    // In practice interesting_remainder1 contains the numerator bit,
+    // but in that position, interesting_remainder2 always has a 0
+    auto &merged_interesting_remainder = interesting_remainder1;
+
+    host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
+                  merged_interesting_remainder.data,
+                  interesting_remainder2.data, radix_params.big_lwe_dimension,
+                  merged_interesting_remainder.len);
+
+    // after create_clean_version_of_merged_remainder
+    // `merged_interesting_remainder` will be reused as
+    // `cleaned_merged_interesting_remainder`
+    cleaned_merged_interesting_remainder.clone_from(
+        merged_interesting_remainder, 0, merged_interesting_remainder.len - 1,
+        streams[0], gpu_indexes[0]);
+
+    assert(merged_interesting_remainder.len == interesting_divisor.len);
+
+    // `new_remainder` is not initialized yet, so need to set length
+    new_remainder.len = merged_interesting_remainder.len;
+
+    // fills:
+    //  `new_remainder` - radix ciphertext
+    //  `subtraction_overflowed` - single ciphertext
+    auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
+                                  uint32_t gpu_count) {
+      host_integer_overflowing_sub_kb<Torus, params>(
+          streams, gpu_indexes, gpu_count, new_remainder.data,
+          subtraction_overflowed.data, merged_interesting_remainder.data,
+          interesting_divisor.data, bsks, ksks, mem_ptr->overflow_sub_mem,
+          merged_interesting_remainder.len);
+    };
+
+    // fills:
+    //  `at_least_one_upper_block_is_non_zero` - single ciphertext
+    auto check_divisor_upper_blocks = [&](cudaStream_t *streams,
+                                          uint32_t *gpu_indexes,
+                                          uint32_t gpu_count) {
+      auto &trivial_blocks = divisor_ms_blocks;
+      if (trivial_blocks.is_empty()) {
+        // no upper block: the flag is set to zero
+        cuda_memset_async(at_least_one_upper_block_is_non_zero.first_block(), 0,
+                          big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+      } else {
+
+        // We could call unchecked_scalar_ne
+        // But we are in the special case where scalar == 0
+        // So we can skip some stuff
+        host_compare_with_zero_equality(
+            streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
+            mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
+            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
+
+        tmp_1.len =
+            ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
+
+        is_at_least_one_comparisons_block_true(
+            streams, gpu_indexes, gpu_count,
+            at_least_one_upper_block_is_non_zero.data, tmp_1.data,
+            mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
+      }
+    };
+
+    // Creates a cleaned version (noise wise) of the merged remainder
+    // so that it can be safely used in bivariate PBSes
+    // fills:
+    //  `cleaned_merged_interesting_remainder` - radix ciphertext
+    auto create_clean_version_of_merged_remainder =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          integer_radix_apply_univariate_lookup_table_kb(
+              streams, gpu_indexes, gpu_count,
+              cleaned_merged_interesting_remainder.data,
+              cleaned_merged_interesting_remainder.data, bsks, ksks,
+              cleaned_merged_interesting_remainder.len,
+              mem_ptr->message_extract_lut_1);
+        };
+
+    // phase 2
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
+#pragma omp parallel sections
+    {
+#pragma omp section
+      {
+        // new_remainder
+        // subtraction_overflowed
+        do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
+      }
+#pragma omp section
+      {
+        // at_least_one_upper_block_is_non_zero
+        check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes,
+                                   gpu_count);
+      }
+#pragma omp section
+      {
+        // cleaned_merged_interesting_remainder
+        create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
+                                                 gpu_indexes, gpu_count);
+      }
+    }
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+    }
+
+    // sum of the two flags, used as second input of the bivariate lookup
+    // tables below
+    host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
+                  subtraction_overflowed.data,
+                  at_least_one_upper_block_is_non_zero.data,
+                  radix_params.big_lwe_dimension, 1);
+
+    // the last iteration (i == 0) uses factor 2, every other one factor 3;
+    // the matching lookup table is stored at index `factor - 2`
+    int factor = (i) ? 3 : 2;
+    int factor_lut_id = factor - 2;
+    overflow_sum_radix.fill_with_same_ciphertext(
+        overflow_sum.first_block(), cleaned_merged_interesting_remainder.len,
+        streams[0], gpu_indexes[0]);
+
+    auto conditionally_zero_out_merged_interesting_remainder =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+              streams, gpu_indexes, gpu_count,
+              cleaned_merged_interesting_remainder.data,
+              cleaned_merged_interesting_remainder.data,
+              overflow_sum_radix.data, bsks, ksks,
+              cleaned_merged_interesting_remainder.len,
+              mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
+              factor);
+        };
+
+    auto conditionally_zero_out_merged_new_remainder =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+              streams, gpu_indexes, gpu_count, new_remainder.data,
+              new_remainder.data, overflow_sum_radix.data, bsks, ksks,
+              new_remainder.len,
+              mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
+        };
+
+    // merges the two flags into `did_not_overflow` and adds it to the
+    // quotient block that holds bit `i`
+    auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
+                                uint32_t gpu_count) {
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+          streams, gpu_indexes, gpu_count, did_not_overflow.data,
+          subtraction_overflowed.data,
+          at_least_one_upper_block_is_non_zero.data, bsks, ksks, 1,
+          mem_ptr->merge_overflow_flags_luts[pos_in_block],
+          mem_ptr->merge_overflow_flags_luts[pos_in_block]
+              ->params.message_modulus);
+
+      host_addition(streams[0], gpu_indexes[0],
+                    &quotient[block_of_bit * big_lwe_size],
+                    &quotient[block_of_bit * big_lwe_size],
+                    did_not_overflow.data, radix_params.big_lwe_dimension, 1);
+    };
+
+    // phase 3
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
+#pragma omp parallel sections
+    {
+#pragma omp section
+      {
+        // cleaned_merged_interesting_remainder
+        conditionally_zero_out_merged_interesting_remainder(
+            mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
+      }
+#pragma omp section
+      {
+        // new_remainder
+        conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
+                                                    gpu_indexes, gpu_count);
+      }
+#pragma omp section
+      {
+        // quotient
+        set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
+      }
+    }
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+    }
+
+    // Both lists hold `last_non_trivial_block + 1 == first_trivial_block`
+    // blocks at this point, which is exactly the number of blocks copied
+    // back below (blocks 0..=first_trivial_block - 1).
+    assert(first_trivial_block == cleaned_merged_interesting_remainder.len);
+    assert(first_trivial_block == new_remainder.len);
+
+    remainder1.copy_from(cleaned_merged_interesting_remainder, 0,
+                         first_trivial_block - 1, streams[0], gpu_indexes[0]);
+    remainder2.copy_from(new_remainder, 0, first_trivial_block - 1, streams[0],
+                         gpu_indexes[0]);
+  }
+
+  assert(remainder1.len == remainder2.len);
+
+  // Merge the two halves of the remainder, then clean the quotient and the
+  // remainder: even though they have no carries, they are not at nominal
+  // noise level
+  host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
+                remainder2.data, radix_params.big_lwe_dimension,
+                remainder1.len);
+
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }
+#pragma omp parallel sections
+  {
+#pragma omp section
+    {
+      integer_radix_apply_univariate_lookup_table_kb(
+          mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
+          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
+    }
+#pragma omp section
+    {
+      integer_radix_apply_univariate_lookup_table_kb(
+          mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
+          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
+    }
+  }
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+    cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+  }
+}
+
+#endif // TFHE_RS_DIV_REM_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -1,119 +1,54 @@
 #include "integer/integer.cuh"
 #include <linear_algebra.h>

-void cuda_full_propagation_64_inplace(
-    cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
-    void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
-    uint32_t num_blocks) {
+void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
+                                      uint32_t gpu_count, void *input_blocks,
+                                      int8_t *mem_ptr, void **ksks, void **bsks,
+                                      uint32_t num_blocks) {

-  switch (polynomial_size) {
-  case 256:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 512:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 1024:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 2048:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 4096:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 8192:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 16384:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  default:
-    PANIC("Cuda error (full propagation inplace): unsupported polynomial size. "
-          "Supported N's are powers of two"
-          " in the interval [256..16384].")
-  }
+  int_fullprop_buffer<uint64_t> *buffer =
+      (int_fullprop_buffer<uint64_t> *)mem_ptr;
+
+  host_full_propagate_inplace<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(input_blocks), buffer, (uint64_t **)(ksks), bsks,
+      num_blocks);
 }

 void scratch_cuda_full_propagation_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus);

  scratch_cuda_full_propagation<uint64_t>(
-      stream, (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension,
-      glwe_dimension, polynomial_size, level_count, grouping_factor,
-      input_lwe_ciphertext_count, message_modulus, carry_modulus, pbs_type,
+      (cudaStream_t *)streams, gpu_indexes, gpu_count,
+      (int_fullprop_buffer<uint64_t> **)mem_ptr, params, num_radix_blocks,
      allocate_gpu_memory);
 }

-void cleanup_cuda_full_propagation(cuda_stream_t *stream,
-                                   int8_t **mem_ptr_void) {
+void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
+                                   uint32_t gpu_count, int8_t **mem_ptr_void) {

  int_fullprop_buffer<uint64_t> *mem_ptr =
      (int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);

-  cuda_drop_async(mem_ptr->lut_buffer, stream);
-  cuda_drop_async(mem_ptr->lut_indexes, stream);
-
-  cuda_drop_async(mem_ptr->lwe_indexes, stream);
-
-  cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
-  cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
-
-  switch (mem_ptr->pbs_type) {
-  case CLASSICAL: {
-    auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(mem_ptr->pbs_buffer);
-    x->release(stream);
-  } break;
-  case MULTI_BIT: {
-    auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(mem_ptr->pbs_buffer);
-    x->release(stream);
-  } break;
-  default:
-    PANIC("Cuda error (PBS): unsupported implementation variant.")
-  }
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

 void scratch_cuda_propagate_single_carry_kb_64_inplace(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -121,23 +56,67 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
                          message_modulus, carry_modulus);

  scratch_cuda_propagate_single_carry_kb_inplace(
-      stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }

-void cuda_propagate_single_carry_kb_64_inplace(cuda_stream_t *stream,
-                                               void *lwe_array, int8_t *mem_ptr,
-                                               void *bsk, void *ksk,
-                                               uint32_t num_blocks) {
+void cuda_propagate_single_carry_kb_64_inplace(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    void *carry_out, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t num_blocks) {
  host_propagate_single_carry<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array),
-      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
+      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks);
 }

-void cleanup_cuda_propagate_single_carry(cuda_stream_t *stream,
+void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
+                                         uint32_t gpu_count,
                                         int8_t **mem_ptr_void) {
  int_sc_prop_memory<uint64_t> *mem_ptr =
      (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
+
+void scratch_cuda_apply_univariate_lut_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus);
+
+  scratch_cuda_apply_univariate_lut_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
+      num_radix_blocks, params, allocate_gpu_memory);
+}
+
+void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes,
+                                     uint32_t gpu_count, void *output_radix_lwe,
+                                     void *input_radix_lwe, int8_t *mem_ptr,
+                                     void **ksks, void **bsks,
+                                     uint32_t num_blocks) {
+
+  host_apply_univariate_lut_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<uint64_t *>(input_radix_lwe),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
+      num_blocks);
+}
+
+void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
+                                             uint32_t *gpu_indexes,
+                                             uint32_t gpu_count,
+                                             int8_t **mem_ptr_void) {
+  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -59,6 +59,40 @@ __global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
  }
 }

+// rotate radix ciphertext right with specific value
+// calculation is not inplace, so `dst` and `src` must not be the same
+template <typename Torus>
+__host__ void
+host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
+                               uint32_t gpu_count, Torus *dst, Torus *src,
+                               uint32_t value, uint32_t blocks_count,
+                               uint32_t lwe_size) {
+  if (src == dst) {
+    PANIC("Cuda error (blocks_rotate_right): the source and destination "
+          "pointers should be different");
+  }
+  cudaSetDevice(gpu_indexes[0]);
+  radix_blocks_rotate_right<<<blocks_count, 1024, 0, streams[0]>>>(
+      dst, src, value, blocks_count, lwe_size);
+}
+
+// rotate radix ciphertext left with specific value
+// calculation is not inplace, so `dst` and `src` must not be the same
+template <typename Torus>
+__host__ void
+host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
+                              uint32_t gpu_count, Torus *dst, Torus *src,
+                              uint32_t value, uint32_t blocks_count,
+                              uint32_t lwe_size) {
+  if (src == dst) {
+    PANIC("Cuda error (blocks_rotate_left): the source and destination "
+          "pointers should be different");
+  }
+  cudaSetDevice(gpu_indexes[0]);
+  radix_blocks_rotate_left<<<blocks_count, 1024, 0, streams[0]>>>(
+      dst, src, value, blocks_count, lwe_size);
+}
+
 // polynomial_size threads
 template <typename Torus>
 __global__ void
@@ -82,18 +116,20 @@ device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out,
 *  becomes out = m1 * shift + m2
 */
 template <typename Torus>
-__host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
+__host__ void pack_bivariate_blocks(cudaStream_t *streams,
+                                    uint32_t *gpu_indexes, uint32_t gpu_count,
+                                    Torus *lwe_array_out,
                                    Torus *lwe_indexes_out, Torus *lwe_array_1,
                                    Torus *lwe_array_2, Torus *lwe_indexes_in,
                                    uint32_t lwe_dimension, uint32_t shift,
                                    uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_indexes[0]);
  // Left message is shifted
  int num_blocks = 0, num_threads = 0;
  int num_entries = num_radix_blocks * (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
+  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
      lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
      lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
@@ -101,9 +137,9 @@ __host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,

 template <typename Torus>
 __host__ void integer_radix_apply_univariate_lookup_table_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
-  cudaSetDevice(stream->gpu_index);
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
  // apply_lookup_table
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
@@ -117,27 +153,38 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

-  // Compute Keyswitch-PBS
-  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_in,
-      lut->lwe_indexes_in, ksk, big_lwe_dimension, small_lwe_dimension,
-      ks_base_log, ks_level, num_radix_blocks);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  /// Apply KS to go from a big LWE dimension to a small LWE dimension
+  execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
+                           lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
+                           lwe_array_in, lut->lwe_indexes_in, ksks,
+                           big_lwe_dimension, small_lwe_dimension, ks_base_log,
+                           ks_level, num_radix_blocks, false);

-  execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
-                     lut->lut_indexes, lut->tmp_lwe_after_ks,
-                     lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
-                     small_lwe_dimension, polynomial_size, pbs_base_log,
-                     pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-                     cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
+  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+  /// dimension to a big LWE dimension
+  execute_pbs<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
+      lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
+      lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
+      small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+      grouping_factor, num_radix_blocks, 1, 0,
+      cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+
+  /// Synchronize all active GPUs
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  for (uint i = 0; i < active_gpu_count; i++) {
+    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+  }
 }

 template <typename Torus>
 __host__ void integer_radix_apply_bivariate_lookup_table_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
-    Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
-    int_radix_lut<Torus> *lut) {
-  cudaSetDevice(stream->gpu_index);
-  // apply_lookup_table_bivariate
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2, void **bsks,
+    Torus **ksks, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut,
+    uint32_t shift) {
+
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -149,27 +196,39 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;
-  auto message_modulus = params.message_modulus;

  // Left message is shifted
  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
-  pack_bivariate_blocks(stream, lwe_array_pbs_in, lut->lwe_trivial_indexes,
-                        lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
-                        big_lwe_dimension, message_modulus, num_radix_blocks);
+  pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
+                        lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
+                        lut->lwe_indexes_in, big_lwe_dimension, shift,
+                        num_radix_blocks);
  check_cuda_error(cudaGetLastError());

-  // Apply LUT
-  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_pbs_in,
-      lut->lwe_trivial_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
-      ks_base_log, ks_level, num_radix_blocks);
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);

-  execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
-                     lut->lut_indexes, lut->tmp_lwe_after_ks,
-                     lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
-                     small_lwe_dimension, polynomial_size, pbs_base_log,
-                     pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-                     cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
+  /// Apply KS to go from a big LWE dimension to a small LWE dimension
+  execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count,
+                           lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes,
+                           lwe_array_pbs_in, lut->lwe_indexes_in, ksks,
+                           big_lwe_dimension, small_lwe_dimension, ks_base_log,
+                           ks_level, num_radix_blocks, false);
+
+  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+  /// dimension to a big LWE dimension
+  execute_pbs<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out,
+      lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks,
+      lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension,
+      small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+      grouping_factor, num_radix_blocks, 1, 0,
+      cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false);
+
+  /// Synchronize all active GPUs
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  for (uint i = 0; i < active_gpu_count; i++) {
+    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+  }
 }

 // Rotates the slice in-place such that the first mid elements of the slice move
@@ -235,19 +294,38 @@ void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
                               message_modulus, carry_modulus, wrapped_f);
 }

+template <typename Torus>
+void generate_lookup_table_bivariate_with_factor(
+    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t message_modulus, uint32_t carry_modulus,
+    std::function<Torus(Torus, Torus)> f, int factor) {
+
+  Torus factor_u64 = factor;
+  auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
+    Torus lhs = (input / factor_u64) % message_modulus;
+    Torus rhs = (input % factor_u64) % message_modulus;
+
+    return f(lhs, rhs);
+  };
+
+  generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
+                               message_modulus, carry_modulus, wrapped_f);
+}
+
 /*
 *  generate bivariate accumulator for device pointer
- *    v_stream - cuda stream
+ *    stream - cuda stream
 *    acc - device pointer for bivariate accumulator
 *    ...
 *    f - wrapping function with two Torus inputs
 */
 template <typename Torus>
 void generate_device_accumulator_bivariate(
-    cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
-    std::function<Torus(Torus, Torus)> f) {
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
+    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f) {

+  cudaSetDevice(gpu_index);
  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -256,30 +334,67 @@ void generate_device_accumulator_bivariate(
  generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
                                         message_modulus, carry_modulus, f);

-  // copy host lut and lut_indexes to device
-  cuda_memcpy_async_to_gpu(
-      acc_bivariate, h_lut,
-      (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
+  // copy host lut to device
+  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
+                           (glwe_dimension + 1) * polynomial_size *
+                               sizeof(Torus),
+                           stream, gpu_index);

  // Release memory when possible
-  cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
+  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
+                           h_lut);
 }

 /*
- *  generate bivariate accumulator for device pointer
+ *  generate bivariate accumulator with factor scaling for device pointer
+ *    stream - cuda stream
+ *    acc - device pointer for bivariate accumulator
+ *    ...
+ *    f - wrapping function with two Torus inputs
+ */
+template <typename Torus>
+void generate_device_accumulator_bivariate_with_factor(
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
+    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor) {
+
+  cudaSetDevice(gpu_index);
+  // host lut
+  Torus *h_lut =
+      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
+
+  // fill bivariate accumulator
+  generate_lookup_table_bivariate_with_factor<Torus>(
+      h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
+      factor);
+
+  // copy host lut to device
+  cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
+                           (glwe_dimension + 1) * polynomial_size *
+                               sizeof(Torus),
+                           stream, gpu_index);
+
+  // Release memory when possible
+  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
+                           h_lut);
+}
+
+/*
+ *  generate accumulator for device pointer
 *    v_stream - cuda stream
 *    acc - device pointer for accumulator
 *    ...
 *    f - evaluating function with one Torus input
 */
 template <typename Torus>
-void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
-                                 uint32_t glwe_dimension,
+void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
+                                 Torus *acc, uint32_t glwe_dimension,
                                 uint32_t polynomial_size,
                                 uint32_t message_modulus,
                                 uint32_t carry_modulus,
                                 std::function<Torus(Torus)> f) {

+  cudaSetDevice(gpu_index);
  // host lut
  Torus *h_lut =
      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
@@ -288,29 +403,33 @@ void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                               message_modulus, carry_modulus, f);

-  // copy host lut and lut_indexes to device
+  // copy host lut to device
  cuda_memcpy_async_to_gpu(
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
-      stream);
+      stream, gpu_index);

  // Release memory when possible
-  cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
+  cuda_stream_add_callback(stream, gpu_index, host_free_on_stream_callback,
+                           h_lut);
 }

 template <typename Torus>
 void scratch_cuda_propagate_single_carry_kb_inplace(
-    cuda_stream_t *stream, int_sc_prop_memory<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_sc_prop_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

-  *mem_ptr = new int_sc_prop_memory<Torus>(stream, params, num_radix_blocks,
-                                           allocate_gpu_memory);
+  *mem_ptr =
+      new int_sc_prop_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                    num_radix_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
-void host_propagate_single_carry(cuda_stream_t *stream, Torus *lwe_array,
-                                 int_sc_prop_memory<Torus> *mem, void *bsk,
-                                 Torus *ksk, uint32_t num_blocks) {
+void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
+                                 uint32_t gpu_count, Torus *lwe_array,
+                                 Torus *carry_out,
+                                 int_sc_prop_memory<Torus> *mem, void **bsks,
+                                 Torus **ksks, uint32_t num_blocks) {
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -325,15 +444,16 @@ void host_propagate_single_carry(cuda_stream_t *stream, Torus *lwe_array,
  auto message_acc = mem->message_acc;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
-      luts_array);
+      streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks,
+      ksks, num_blocks, luts_array);

  // compute prefix sum with hillis&steele

  int num_steps = ceil(log2((double)num_blocks));
  int space = 1;
  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
-                               big_lwe_size_bytes * num_blocks, stream);
+                               big_lwe_size_bytes * num_blocks, streams[0],
+                               gpu_indexes[0]);

  for (int step = 0; step < num_steps; step++) {
    auto cur_blocks = &step_output[space * big_lwe_size];
@@ -341,31 +461,41 @@ void host_propagate_single_carry(cuda_stream_t *stream, Torus *lwe_array,
    int cur_total_blocks = num_blocks - space;

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
-        luts_carry_propagation_sum);
+        streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
+        bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
+        luts_carry_propagation_sum->params.message_modulus);

-    cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
-                                 cur_blocks,
-                                 big_lwe_size_bytes * cur_total_blocks, stream);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(
+        &generates_or_propagates[space * big_lwe_size], cur_blocks,
+        big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
    space *= 2;
  }

-  radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
-      step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
-  cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
+  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
+                                 generates_or_propagates, 1, num_blocks,
+                                 big_lwe_size);
+  if (carry_out != nullptr) {
+    cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes,
+                                 streams[0], gpu_indexes[0]);
+  }
+  cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
+                    gpu_indexes[0]);

-  host_addition(stream, lwe_array, lwe_array, step_output,
+  host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
                glwe_dimension * polynomial_size, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
+      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
+      num_blocks, message_acc);
 }

 template <typename Torus>
-void host_propagate_single_sub_borrow(cuda_stream_t *stream, Torus *overflowed,
-                                      Torus *lwe_array,
+void host_propagate_single_sub_borrow(cudaStream_t *streams,
+                                      uint32_t *gpu_indexes, uint32_t gpu_count,
+                                      Torus *overflowed, Torus *lwe_array,
                                      int_single_borrow_prop_memory<Torus> *mem,
-                                      void *bsk, Torus *ksk,
+                                      void **bsks, Torus **ksks,
                                      uint32_t num_blocks) {
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
@@ -381,14 +511,15 @@ void host_propagate_single_sub_borrow(cuda_stream_t *stream, Torus *overflowed,
  auto message_acc = mem->message_acc;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
-      luts_array);
+      streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks,
+      ksks, num_blocks, luts_array);

  // compute prefix sum with hillis&steele
  int num_steps = ceil(log2((double)num_blocks));
  int space = 1;
  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
-                               big_lwe_size_bytes * num_blocks, stream);
+                               big_lwe_size_bytes * num_blocks, streams[0],
+                               gpu_indexes[0]);

  for (int step = 0; step < num_steps; step++) {
    auto cur_blocks = &step_output[space * big_lwe_size];
@@ -396,177 +527,104 @@ void host_propagate_single_sub_borrow(cuda_stream_t *stream, Torus *overflowed,
    int cur_total_blocks = num_blocks - space;

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
-        luts_carry_propagation_sum);
+        streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
+        bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
+        luts_carry_propagation_sum->params.message_modulus);

-    cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
-                                 cur_blocks,
-                                 big_lwe_size_bytes * cur_total_blocks, stream);
+    cuda_memcpy_async_gpu_to_gpu(
+        &generates_or_propagates[space * big_lwe_size], cur_blocks,
+        big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
    space *= 2;
  }

  cuda_memcpy_async_gpu_to_gpu(
      overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
-      big_lwe_size_bytes, stream);
+      big_lwe_size_bytes, streams[0], gpu_indexes[0]);

-  radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
-      step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
-  cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
+  host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
+                                 generates_or_propagates, 1, num_blocks,
+                                 big_lwe_size);
+  cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
+                    gpu_indexes[0]);

-  host_subtraction(stream, lwe_array, lwe_array, step_output,
-                   glwe_dimension * polynomial_size, num_blocks);
+  host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
+                   step_output, glwe_dimension * polynomial_size, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
+      streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
+      num_blocks, message_acc);
 }

 /*
 * input_blocks: input radix ciphertext propagation will happen inplace
 * acc_message_carry: list of two lut s, [(message_acc), (carry_acc)]
- * lut_indexes_message_carry: lut_indexes for message and carry, should always
- * be  {0, 1} small_lwe_vector: output of keyswitch should have size = 2 *
- * (lwe_dimension + 1) * sizeof(Torus) big_lwe_vector: output of pbs should have
- *     size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
+ * lut_indexes_message_carry: lut_indexes_vec for message/carry, always {0, 1}
+ * small_lwe_vector: keyswitch output, 2 * (lwe_dimension + 1) * sizeof(Torus)
+ * big_lwe_vector: pbs output, of size
+ *     2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
 */
-template <typename Torus, typename STorus, class params>
-void host_full_propagate_inplace(cuda_stream_t *stream, Torus *input_blocks,
+template <typename Torus>
+void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
+                                 uint32_t gpu_count, Torus *input_blocks,
                                  int_fullprop_buffer<Torus> *mem_ptr,
-                                 Torus *ksk, void *bsk, uint32_t lwe_dimension,
-                                 uint32_t glwe_dimension,
-                                 uint32_t polynomial_size, uint32_t ks_base_log,
-                                 uint32_t ks_level, uint32_t pbs_base_log,
-                                 uint32_t pbs_level, uint32_t grouping_factor,
+                                 Torus **ksks, void **bsks,
                                  uint32_t num_blocks) {
+  auto params = mem_ptr->lut->params; // replaces the removed dimension/level arguments

-  int big_lwe_size = (glwe_dimension * polynomial_size + 1);
-  int small_lwe_size = (lwe_dimension + 1);
+  int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
+  int small_lwe_size = (params.small_lwe_dimension + 1);

   for (int i = 0; i < num_blocks; i++) {
     auto cur_input_block = &input_blocks[i * big_lwe_size];

+    cudaSetDevice(gpu_indexes[0]); // the calls in this loop are given streams[0] / gpu_indexes[0]
+    /// Since the keyswitch is done on one input only, use only 1 GPU (ksks[0])
     cuda_keyswitch_lwe_ciphertext_vector<Torus>(
-        stream, mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes,
-        cur_input_block, mem_ptr->lwe_indexes, ksk,
-        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
-        1);
+        streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector,
+        mem_ptr->lut->lwe_trivial_indexes, cur_input_block,
+        mem_ptr->lut->lwe_trivial_indexes, ksks[0], params.big_lwe_dimension,
+        params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);

     cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
                                  mem_ptr->tmp_small_lwe_vector,
-                                 small_lwe_size * sizeof(Torus), stream);
+                                 small_lwe_size * sizeof(Torus), streams[0],
+                                 gpu_indexes[0]); // duplicates slot 0 of the small buffer into slot 1

     execute_pbs<Torus>(
-        stream, mem_ptr->tmp_big_lwe_vector, mem_ptr->lwe_indexes,
-        mem_ptr->lut_buffer, mem_ptr->lut_indexes,
-        mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk,
-        mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
-        pbs_base_log, pbs_level, grouping_factor, 2, 2, 0,
-        cuda_get_max_shared_memory(stream->gpu_index), mem_ptr->pbs_type);
+        streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector, // gpu_count is fixed to 1 here
+        mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
+        mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector,
+        mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
+        params.glwe_dimension, params.small_lwe_dimension,
+        params.polynomial_size, params.pbs_base_log, params.pbs_level,
+        params.grouping_factor, 2, 2, 0,
+        cuda_get_max_shared_memory(gpu_indexes[0]), params.pbs_type);

     cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
-                                 big_lwe_size * sizeof(Torus), stream);
+                                 big_lwe_size * sizeof(Torus), streams[0],
+                                 gpu_indexes[0]); // slot 0 of the PBS output overwrites the current block

     if (i < num_blocks - 1) {
       auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
-      host_addition(stream, next_input_block, next_input_block,
+      host_addition(streams[0], gpu_indexes[0], next_input_block,
+                    next_input_block,
                     &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
-                    glwe_dimension * polynomial_size, 1);
+                    params.big_lwe_dimension, 1); // slot 1 of the PBS output is added into the next block
     }
   }
 }

 template <typename Torus>
-void scratch_cuda_full_propagation(
-    cuda_stream_t *stream, int_fullprop_buffer<Torus> **mem_ptr,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
+                                   uint32_t gpu_count,
+                                   int_fullprop_buffer<Torus> **mem_ptr,
+                                   int_radix_params params, // bundles the scalar parameters removed above
+                                   uint32_t num_radix_blocks,
+                                   bool allocate_gpu_memory) {

-  int8_t *pbs_buffer;
-  execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
-                             polynomial_size, pbs_level, grouping_factor,
-                             num_radix_blocks,
-                             cuda_get_max_shared_memory(stream->gpu_index),
-                             pbs_type, allocate_gpu_memory);
-
-  // LUT
-  Torus *lut_buffer;
-  if (allocate_gpu_memory) {
-    // LUT is used as a trivial encryption, so we only allocate memory for the
-    // body
-    Torus lut_buffer_size =
-        2 * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);
-
-    lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream);
-
-    // LUTs
-    auto lut_f_message = [message_modulus](Torus x) -> Torus {
-      return x % message_modulus;
-    };
-    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
-      return x / message_modulus;
-    };
-
-    //
-    Torus *lut_buffer_message = lut_buffer;
-    Torus *lut_buffer_carry =
-        lut_buffer + (glwe_dimension + 1) * polynomial_size;
-
-    generate_device_accumulator<Torus>(
-        stream, lut_buffer_message, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, lut_f_message);
-
-    generate_device_accumulator<Torus>(stream, lut_buffer_carry, glwe_dimension,
-                                       polynomial_size, message_modulus,
-                                       carry_modulus, lut_f_carry);
-  }
-
-  Torus *lut_indexes;
-  if (allocate_gpu_memory) {
-    lut_indexes = (Torus *)cuda_malloc_async(2 * sizeof(Torus), stream);
-
-    Torus h_lut_indexes[2] = {0, 1};
-    cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, 2 * sizeof(Torus),
-                             stream);
-  }
-
-  Torus *lwe_indexes;
-  if (allocate_gpu_memory) {
-    Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
-
-    lwe_indexes = (Torus *)cuda_malloc_async(lwe_indexes_size, stream);
-    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
-    for (int i = 0; i < num_radix_blocks; i++)
-      h_lwe_indexes[i] = i;
-    cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
-                             stream);
-    cuda_stream_add_callback(stream, host_free_on_stream_callback,
-                             h_lwe_indexes);
-  }
-
-  // Temporary arrays
-  Torus *small_lwe_vector;
-  Torus *big_lwe_vector;
-  if (allocate_gpu_memory) {
-    Torus small_vector_size = 2 * (lwe_dimension + 1) * sizeof(Torus);
-    Torus big_vector_size =
-        2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus);
-
-    small_lwe_vector = (Torus *)cuda_malloc_async(small_vector_size, stream);
-    big_lwe_vector = (Torus *)cuda_malloc_async(big_vector_size, stream);
-  }
-
-  *mem_ptr = new int_fullprop_buffer<Torus>;
-
-  (*mem_ptr)->pbs_type = pbs_type;
-  (*mem_ptr)->pbs_buffer = pbs_buffer;
-
-  (*mem_ptr)->lut_buffer = lut_buffer;
-  (*mem_ptr)->lut_indexes = lut_indexes;
-  (*mem_ptr)->lwe_indexes = lwe_indexes;
-
-  (*mem_ptr)->tmp_small_lwe_vector = small_lwe_vector;
-  (*mem_ptr)->tmp_big_lwe_vector = big_lwe_vector;
+  *mem_ptr = // the function body is now just this constructor call
+      new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
+                                     num_radix_blocks, allocate_gpu_memory); // NOTE(review): presumably the constructor now does the allocations/LUT setup removed above — confirm
 }

 // (lwe_dimension+1) threads
@@ -607,19 +665,16 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
 //
 // Expects the carry buffer to be empty
 template <typename Torus>
-__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
-                          Torus *lwe_array_in, uint32_t lwe_dimension,
-                          uint32_t num_radix_blocks, uint32_t factor) {
-  if (lwe_array_out == lwe_array_in)
-    PANIC("Cuda error in pack blocks: input and output pointers must be "
-          "different.");
-
-  cudaSetDevice(stream->gpu_index);
+__host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
+                          Torus *lwe_array_out, Torus *lwe_array_in,
+                          uint32_t lwe_dimension, uint32_t num_radix_blocks,
+                          uint32_t factor) {
+  cudaSetDevice(gpu_index); // NOTE(review): the lwe_array_out == lwe_array_in PANIC guard was dropped — confirm callers never alias

   int num_blocks = 0, num_threads = 0;
   int num_entries = (lwe_dimension + 1);
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  device_pack_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
+  getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads); // second argument raised from 512 to 1024
+  device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>( // launched on the raw cudaStream_t; no error check follows the launch
       lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
 }

@@ -639,14 +694,16 @@ device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,

 template <typename Torus>
 __host__ void
-create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
-                     Torus *scalar_array, uint32_t lwe_dimension,
-                     uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
-                     uint64_t message_modulus, uint64_t carry_modulus) {
+create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
+                     Torus *lwe_array_out, Torus *scalar_array,
+                     uint32_t lwe_dimension, uint32_t num_radix_blocks,
+                     uint32_t num_scalar_blocks, uint64_t message_modulus,
+                     uint64_t carry_modulus) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
  size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
-  cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);
+  cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream,
+                    gpu_index);

  if (num_scalar_blocks == 0)
    return;
@@ -663,7 +720,7 @@ create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_create_trivial_radix<<<grid, thds, 0, stream->stream>>>(
+  device_create_trivial_radix<<<grid, thds, 0, stream>>>(
      lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
 }
@@ -674,23 +731,26 @@ create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
 * * (lwe_dimension+1) * sizeeof(Torus) bytes
 */
 template <typename Torus>
-__host__ void extract_n_bits(cuda_stream_t *stream, Torus *lwe_array_out,
-                             Torus *lwe_array_in, void *bsk, Torus *ksk,
+__host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
+                             uint32_t gpu_count, Torus *lwe_array_out,
+                             Torus *lwe_array_in, void **bsks, Torus **ksks, // keys are now passed as arrays (bsks/ksks)
                              uint32_t num_radix_blocks, uint32_t bits_per_block,
                              int_bit_extract_luts_buffer<Torus> *bit_extract) {

   integer_radix_apply_univariate_lookup_table_kb(
-      stream, lwe_array_out, lwe_array_in, bsk, ksk,
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks, // LUT applied to num_radix_blocks * bits_per_block blocks
       num_radix_blocks * bits_per_block, bit_extract->lut);
 }

 template <typename Torus>
-__host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
-                           Torus *signs_array_in,
-                           int_comparison_buffer<Torus> *mem_ptr,
-                           std::function<Torus(Torus)> sign_handler_f,
-                           void *bsk, Torus *ksk, uint32_t num_sign_blocks) {
+__host__ void
+reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+             Torus *signs_array_out, Torus *signs_array_in,
+             int_comparison_buffer<Torus> *mem_ptr,
+             std::function<Torus(Torus)> sign_handler_f, void **bsks,
+             Torus **ksks, uint32_t num_sign_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
@@ -711,20 +771,24 @@ __host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
  auto signs_a = diff_buffer->tmp_signs_a;
  auto signs_b = diff_buffer->tmp_signs_b;

-  cuda_memcpy_async_gpu_to_gpu(
-      signs_a, signs_array_in,
-      (big_lwe_dimension + 1) * num_sign_blocks * sizeof(Torus), stream);
+  cuda_memcpy_async_gpu_to_gpu(signs_a, signs_array_in,
+                               (big_lwe_dimension + 1) * num_sign_blocks *
+                                   sizeof(Torus),
+                               streams[0], gpu_indexes[0]);
  if (num_sign_blocks > 2) {
    auto lut = diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(
-        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
-        carry_modulus, reduce_two_orderings_function);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        reduce_two_orderings_function);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    while (num_sign_blocks > 2) {
-      pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, num_sign_blocks,
-                  4);
+      pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
+                  big_lwe_dimension, num_sign_blocks, 4);
      integer_radix_apply_univariate_lookup_table_kb(
-          stream, signs_a, signs_b, bsk, ksk, num_sign_blocks / 2, lut);
+          streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
+          num_sign_blocks / 2, lut);

      auto last_block_signs_b =
          signs_b + (num_sign_blocks / 2) * (big_lwe_dimension + 1);
@@ -733,7 +797,7 @@ __host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
      if (num_sign_blocks % 2 == 1)
        cuda_memcpy_async_gpu_to_gpu(last_block_signs_a, last_block_signs_b,
                                     (big_lwe_dimension + 1) * sizeof(Torus),
-                                     stream);
+                                     streams[0], gpu_indexes[0]);

      num_sign_blocks = (num_sign_blocks / 2) + (num_sign_blocks % 2);
    }
@@ -747,13 +811,17 @@ __host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
    };

    auto lut = diff_buffer->reduce_signs_lut;
-    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
-                                       polynomial_size, message_modulus,
-                                       carry_modulus, final_lut_f);
+    generate_device_accumulator<Torus>(
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        final_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, 2, 4);
-    integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
-                                                   signs_b, bsk, ksk, 1, lut);
+    pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
+                2, 4);
+    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
+                                                   gpu_count, signs_array_out,
+                                                   signs_b, bsks, ksks, 1, lut);

  } else {

@@ -763,12 +831,45 @@ __host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
    };

    auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
-    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
-                                       polynomial_size, message_modulus,
-                                       carry_modulus, final_lut_f);
+    generate_device_accumulator<Torus>(
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        final_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
-                                                   signs_a, bsk, ksk, 1, lut);
+    integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
+                                                   gpu_count, signs_array_out,
+                                                   signs_a, bsks, ksks, 1, lut);
  }
 }
+
+template <typename Torus>
+void scratch_cuda_apply_univariate_lut_kb( // creates *mem_ptr and loads input_lut into it
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_radix_lut<Torus> **mem_ptr, Torus *input_lut, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
+                                      1, num_radix_blocks, allocate_gpu_memory); // NOTE(review): the literal 1 is presumably the number of LUTs — confirm
+  // It is safe to do this copy on GPU 0 (i.e. gpu_indexes[0]), because all
+  // LUTs always reside on GPU 0; broadcast_lut() is called right after the copy
+  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(gpu_indexes[0], 0), input_lut,
+                           (params.glwe_dimension + 1) *
+                               params.polynomial_size * sizeof(Torus), // byte size of the single LUT being copied
+                           streams[0], gpu_indexes[0]);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+}
+
+template <typename Torus>
+void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes, // thin forwarding wrapper, see the call below
+                                  uint32_t gpu_count, Torus *radix_lwe_out,
+                                  Torus *radix_lwe_in,
+                                  int_radix_lut<Torus> *mem, Torus **ksks,
+                                  void **bsks, uint32_t num_blocks) {
+
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks, // note: callee takes (bsks, ksks), this signature takes (ksks, bsks)
+      num_blocks, mem);
+}
+
 #endif // TFHE_RS_INTERNAL_INTEGER_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -66,12 +66,12 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
 * the integer radix multiplication in keyswitch->bootstrap order.
 */
 void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t message_modulus,
-    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
-    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
+    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
+    uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type,
+    uint32_t max_shared_memory, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          polynomial_size * glwe_dimension, lwe_dimension,
@@ -79,14 +79,21 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
                          grouping_factor, message_modulus, carry_modulus);

  switch (polynomial_size) {
+  case 256:
+  case 512:
+  case 1024:
  case 2048:
+  case 4096:
+  case 8192:
+  case 16384:
    scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
-        stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
        allocate_gpu_memory);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
-          "Only N = 2048 is supported")
+          "Supported N's are powers of two in the interval [256..16384].")
  }
 }

@@ -119,58 +126,106 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
 * - 'max_shared_memory' maximum shared memory per cuda block
 */
 void cuda_integer_mult_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_left,
-    void *radix_lwe_right, void *bsk, void *ksk, int8_t *mem_ptr,
-    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
-    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
-    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
-    uint32_t max_shared_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, // streams are opaque handles, cast to cudaStream_t* below
+    void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
+    void **bsks, void **ksks, int8_t *mem_ptr, uint32_t polynomial_size, // mem_ptr is cast to int_mul_memory<uint64_t>* below
+    uint32_t num_blocks) {

   switch (polynomial_size) {
+  case 256: // every case makes the same call; only the AmortizedDegree<N> argument differs
+    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 512:
+    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 1024:
+    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
   case 2048:
     host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
         static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
-        num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 4096:
+    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 8192:
+    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+    break;
+  case 16384:
+    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_left),
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
     break;
   default:
     PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
-          "Only N = 2048 is supported")
+          "Supported N's are powers of two in the interval [256..16384].")
   }
 }

-void cleanup_cuda_integer_mult(cuda_stream_t *stream, int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
+                               uint32_t gpu_count, int8_t **mem_ptr_void) {

   int_mul_memory<uint64_t> *mem_ptr =
       (int_mul_memory<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); // NOTE(review): only release() is called; the object is not deleted and *mem_ptr_void is not reset here — confirm ownership
 }

 void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_in_radix,
-    uint32_t max_num_radix_in_vec, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, // stream argument replaced by (streams, gpu_indexes, gpu_count)
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {

   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
                           grouping_factor, message_modulus, carry_modulus);
   scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
-      stream, (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr,
-      num_blocks_in_radix, max_num_radix_in_vec, params, allocate_gpu_memory);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, // opaque void** handles reinterpreted as cudaStream_t*
+      (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
+      max_num_radix_in_vec, params, allocate_gpu_memory);
 }

 void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
-    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_vec,
-    uint32_t num_radix_in_vec, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t num_blocks_in_radix) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
+    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {

  auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;

@@ -184,58 +239,60 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
  switch (mem->params.polynomial_size) {
  case 512:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 1024:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 2048:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 4096:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 8192:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  case 16384:
    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
+        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
+        (uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
    break;
  default:
-    PANIC("Cuda error (integer sum ciphertexts): unsupported polynomial size. "
-          "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
+    PANIC("Cuda error (integer sum ciphertexts): unsupported polynomial size. "
+          "Supported N's are powers of two in the interval [512..16384].")
  }

  free(terms_degree);
 }

-void cleanup_cuda_integer_radix_sum_ciphertexts_vec(cuda_stream_t *stream,
+void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
+                                                    uint32_t *gpu_indexes,
+                                                    uint32_t gpu_count,
                                                    int8_t **mem_ptr_void) {
  int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
      (int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -91,12 +91,15 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
  }
 }

-template <typename Torus>
+template <typename Torus, sharedMemDegree SMD>
 __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
                                uint32_t chunk_size, uint32_t block_size,
                                uint32_t num_blocks) {

-  extern __shared__ Torus result[];
+  extern __shared__ int8_t sharedmem[];
+
+  Torus *result = (Torus *)sharedmem;
+
  size_t stride = blockDim.x;
  size_t chunk_id = blockIdx.x;
  size_t chunk_elem_size = chunk_size * num_blocks * block_size;
@@ -106,6 +109,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
  size_t block_stride = blockIdx.y * block_size;
  auto dst_block = &dst_radix[block_stride];

+  if constexpr (SMD == NOSM)
+    result = dst_block;
+
  // init shared mem with first radix of chunk
  size_t tid = threadIdx.x;
  for (int i = tid; i < block_size; i += stride) {
@@ -121,9 +127,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
  }

  // put result from shared mem to global mem
-  for (int i = tid; i < block_size; i += stride) {
-    dst_block[i] = result[i];
-  }
+  if constexpr (SMD == FULLSM)
+    for (int i = tid; i < block_size; i += stride)
+      dst_block[i] = result[i];
 }

 template <typename Torus, class params>
@@ -175,39 +181,42 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
 }
 template <typename Torus>
 __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
-    cuda_stream_t *stream, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  check_cuda_error(cudaFuncSetAttribute(
-      tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
-      sm_size));
-  cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
-  check_cuda_error(cudaGetLastError());
+  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
+    check_cuda_error(cudaFuncSetAttribute(
+        tree_add_chunks<Torus, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
+    cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
+                           cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(
+        cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
+                             cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
+    check_cuda_error(cudaGetLastError());
+  }
  *mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
-      stream, params, num_blocks_in_radix, max_num_radix_in_vec,
-      allocate_gpu_memory);
+      streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
+      max_num_radix_in_vec, allocate_gpu_memory);
 }

 template <typename Torus, class params>
 __host__ void host_integer_sum_ciphertexts_vec_kb(
-    cuda_stream_t *stream, Torus *radix_lwe_out, Torus *terms,
-    int *terms_degree, void *bsk, uint64_t *ksk,
-    int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
+    uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec) {

-  cudaSetDevice(stream->gpu_index);
  auto new_blocks = mem_ptr->new_blocks;
  auto old_blocks = mem_ptr->old_blocks;
  auto small_lwe_vector = mem_ptr->small_lwe_vector;

-  auto luts_message_carry = mem_ptr->luts_message_carry;
-
-  auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
-  auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
-
  auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
  auto d_smart_copy_out = mem_ptr->d_smart_copy_out;

@@ -224,7 +233,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
    cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
                                 num_blocks_in_radix * num_radix_in_vec *
                                     big_lwe_size * sizeof(Torus),
-                                 stream);
+                                 streams[0], gpu_indexes[0]);
  }

  size_t r = num_radix_in_vec;
@@ -237,7 +246,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
  int32_t h_smart_copy_in[r * num_blocks];
  int32_t h_smart_copy_out[r * num_blocks];

-  auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
+  auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]);

  while (r > 2) {
    size_t cur_total_blocks = r * num_blocks;
@@ -247,8 +256,15 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
    dim3 add_grid(ch_amount, num_blocks, 1);
    size_t sm_size = big_lwe_size * sizeof(Torus);

-    tree_add_chunks<Torus><<<add_grid, 512, sm_size, stream->stream>>>(
-        new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
+    cudaSetDevice(gpu_indexes[0]);
+    if (sm_size < max_shared_memory)
+      tree_add_chunks<Torus, FULLSM><<<add_grid, 512, sm_size, streams[0]>>>(
+          new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
+    else
+      tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, streams[0]>>>(
+          new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
+
+    check_cuda_error(cudaGetLastError());

    size_t total_count = 0;
    size_t message_count = 0;
@@ -260,36 +276,80 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
        h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
        total_count, message_count, carry_count, sm_copy_count);

+    // create lut object for message and carry
+    // we allocate luts_message_carry in the host function (instead of scratch)
+    // to reduce average memory consumption
+    auto luts_message_carry = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->params, 2, total_count, true);
+
+    auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
+    auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);
+
+    // define functions for each accumulator
+    auto lut_f_message = [message_modulus](Torus x) -> Torus {
+      return x % message_modulus;
+    };
+    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
+      return x / message_modulus;
+    };
+
+    // generate accumulators
+    generate_device_accumulator<Torus>(
+        streams[0], gpu_indexes[0], message_acc, glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus, lut_f_message);
+    generate_device_accumulator<Torus>(
+        streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, lut_f_carry);
+
+    auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
+    auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
+
    size_t copy_size = total_count * sizeof(Torus);
-    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size, stream);
-    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size, stream);
+    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size,
+                             streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size,
+                             streams[0], gpu_indexes[0]);
    copy_size = sm_copy_count * sizeof(int32_t);
    cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
-                             stream);
+                             streams[0], gpu_indexes[0]);
    cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
-                             stream);
+                             streams[0], gpu_indexes[0]);

-    smart_copy<<<sm_copy_count, 256, 0, stream->stream>>>(
+    smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
        new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
        big_lwe_size);
+    check_cuda_error(cudaGetLastError());

    if (carry_count > 0)
      cuda_set_value_async<Torus>(
-          &(stream->stream), luts_message_carry->get_lut_indexes(message_count),
-          1, carry_count);
+          streams[0], gpu_indexes[0],
+          luts_message_carry->get_lut_indexes(gpu_indexes[0], message_count), 1,
+          carry_count);

-    cuda_keyswitch_lwe_ciphertext_vector(
-        stream, small_lwe_vector, lwe_indexes_in, new_blocks, lwe_indexes_in,
-        ksk, polynomial_size * glwe_dimension, lwe_dimension,
-        mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
+    luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

-    execute_pbs<Torus>(
-        stream, new_blocks, lwe_indexes_out, luts_message_carry->lut,
-        luts_message_carry->lut_indexes, small_lwe_vector, lwe_indexes_in, bsk,
-        luts_message_carry->buffer, glwe_dimension, lwe_dimension,
-        polynomial_size, mem_ptr->params.pbs_base_log,
-        mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count,
-        2, 0, max_shared_memory, mem_ptr->params.pbs_type);
+    auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    /// After this keyswitch execution, we need to synchronize the streams
+    /// because the keyswitch and PBS do not operate on the same number of
+    /// inputs
+    execute_keyswitch<Torus>(streams, gpu_indexes, gpu_count, small_lwe_vector,
+                             lwe_indexes_in, new_blocks, lwe_indexes_in, ksks,
+                             polynomial_size * glwe_dimension, lwe_dimension,
+                             mem_ptr->params.ks_base_log,
+                             mem_ptr->params.ks_level, message_count, true);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs<Torus>(streams, gpu_indexes, gpu_count, new_blocks,
+                       lwe_indexes_out, luts_message_carry->lut_vec,
+                       luts_message_carry->lut_indexes_vec, small_lwe_vector,
+                       lwe_indexes_in, bsks, luts_message_carry->buffer,
+                       glwe_dimension, lwe_dimension, polynomial_size,
+                       mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
+                       mem_ptr->params.grouping_factor, total_count, 2, 0,
+                       max_shared_memory, mem_ptr->params.pbs_type, true);
+    luts_message_carry->release(streams, gpu_indexes, gpu_count);

    int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
    int new_blocks_created = 2 * ch_amount * num_blocks;
@@ -297,26 +357,28 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(

    auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
    auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
-    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
+    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, streams[0],
+                                 gpu_indexes[0]);
    std::swap(new_blocks, old_blocks);
    r = (new_blocks_created + rem_blocks) / num_blocks;
  }

-  host_addition(stream, radix_lwe_out, old_blocks,
+  host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
                &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
                num_blocks);

-  host_propagate_single_carry<Torus>(stream, radix_lwe_out, mem_ptr->scp_mem,
-                                     bsk, ksk, num_blocks);
+  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count,
+                                     radix_lwe_out, nullptr, mem_ptr->scp_mem,
+                                     bsks, ksks, num_blocks);
 }

 template <typename Torus, typename STorus, class params>
 __host__ void host_integer_mult_radix_kb(
-    cuda_stream_t *stream, uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
-    uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
+    uint64_t *radix_lwe_right, void **bsks, uint64_t **ksks,
    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
  auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
@@ -383,24 +445,28 @@ __host__ void host_integer_mult_radix_kb(
  dim3 grid(lsb_vector_block_count, 1, 1);
  dim3 thds(params::degree / params::opt, 1, 1);

-  all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
+  cudaSetDevice(gpu_indexes[0]);
+  all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
      radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
      vector_lsb_rhs, vector_msb_rhs, num_blocks);
+  check_cuda_error(cudaGetLastError());

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
-      total_block_count, luts_array);
+      streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res,
+      vector_result_sb, bsks, ksks, total_block_count, luts_array,
+      luts_array->params.message_modulus);

  vector_result_lsb = &block_mul_res[0];
  vector_result_msb = &block_mul_res[lsb_vector_block_count *
                                     (polynomial_size * glwe_dimension + 1)];

+  cudaSetDevice(gpu_indexes[0]);
  fill_radix_from_lsb_msb<Torus, params>
      <<<num_blocks * num_blocks, params::degree / params::opt, 0,
-         stream->stream>>>(vector_result_sb, vector_result_lsb,
-                           vector_result_msb, glwe_dimension,
-                           lsb_vector_block_count, msb_vector_block_count,
-                           num_blocks);
+         streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
+                       glwe_dimension, lsb_vector_block_count,
+                       msb_vector_block_count, num_blocks);
+  check_cuda_error(cudaGetLastError());

  int terms_degree[2 * num_blocks * num_blocks];
  for (int i = 0; i < num_blocks * num_blocks; i++) {
@@ -416,25 +482,34 @@ __host__ void host_integer_mult_radix_kb(
  }

  host_integer_sum_ciphertexts_vec_kb<Torus, params>(
-      stream, radix_lwe_out, vector_result_sb, terms_degree, bsk, ksk,
-      mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
+      streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb,
+      terms_degree, bsks, ksks, mem_ptr->sum_ciphertexts_mem, num_blocks,
+      2 * num_blocks);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
-    cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
-  cudaSetDevice(stream->gpu_index);
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {
  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  check_cuda_error(cudaFuncSetAttribute(
-      tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
-      sm_size));
-  cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
-  check_cuda_error(cudaGetLastError());
+  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
+    check_cuda_error(cudaFuncSetAttribute(
+        tree_add_chunks<Torus, FULLSM>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
+    cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
+                           cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(
+        cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
+                             cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
+    check_cuda_error(cudaGetLastError());
+  }

-  *mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
-                                       allocate_gpu_memory);
+  *mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
+                                       num_radix_blocks, allocate_gpu_memory);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -1,23 +1,23 @@
 #include "integer/negation.cuh"

 void cuda_negate_integer_radix_ciphertext_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
+    uint32_t message_modulus, uint32_t carry_modulus) {

-  host_integer_radix_negation(stream, static_cast<uint64_t *>(lwe_array),
-                              static_cast<uint64_t *>(lwe_array), lwe_dimension,
-                              lwe_ciphertext_count, message_modulus,
-                              carry_modulus);
+  host_integer_radix_negation(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_array),
+      lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
 }

 void scratch_cuda_integer_radix_overflowing_sub_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -25,65 +25,73 @@ void scratch_cuda_integer_radix_overflowing_sub_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_overflowing_sub_kb<uint64_t>(
-      stream, (int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks,
-      params, allocate_gpu_memory);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks, params,
+      allocate_gpu_memory);
 }

 void cuda_integer_radix_overflowing_sub_kb_64(
-    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_overflowed,
-    void *radix_lwe_left, void *radix_lwe_right, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *radix_lwe_out, void *radix_lwe_overflowed, void *radix_lwe_left,
+    void *radix_lwe_right, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t num_blocks) {

  auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;

  switch (mem->params.polynomial_size) {
  case 512:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<512>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 1024:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<1024>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 2048:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 4096:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<4096>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 8192:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<8192>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  case 16384:
    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<16384>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_overflowed),
        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
+        static_cast<uint64_t *>(radix_lwe_right), bsks, (uint64_t **)(ksks),
+        mem, num_blocks);
    break;
  default:
    PANIC("Cuda error (integer overflowing sub): unsupported polynomial size. "
@@ -91,10 +99,12 @@ void cuda_integer_radix_overflowing_sub_kb_64(
  }
 }

-void cleanup_cuda_integer_radix_overflowing_sub(cuda_stream_t *stream,
+void cleanup_cuda_integer_radix_overflowing_sub(void **streams,
+                                                uint32_t *gpu_indexes,
+                                                uint32_t gpu_count,
                                                int8_t **mem_ptr_void) {
  int_overflowing_sub_memory<uint64_t> *mem_ptr =
      (int_overflowing_sub_memory<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -58,12 +58,13 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
 }

 template <typename Torus>
-__host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
-                                          Torus *input, uint32_t lwe_dimension,
-                                          uint32_t input_lwe_ciphertext_count,
-                                          uint64_t message_modulus,
-                                          uint64_t carry_modulus) {
-  cudaSetDevice(stream->gpu_index);
+__host__ void
+host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
+                            uint32_t gpu_count, Torus *output, Torus *input,
+                            uint32_t lwe_dimension,
+                            uint32_t input_lwe_ciphertext_count,
+                            uint64_t message_modulus, uint64_t carry_modulus) {
+  cudaSetDevice(gpu_indexes[0]);

  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
@@ -81,7 +82,7 @@ __host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_negation<<<grid, thds, shared_mem, stream->stream>>>(
+  device_integer_radix_negation<<<grid, thds, shared_mem, streams[0]>>>(
      output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
      carry_modulus, delta);
  check_cuda_error(cudaGetLastError());
@@ -89,30 +90,32 @@ __host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,

 template <typename Torus>
 __host__ void scratch_cuda_integer_overflowing_sub_kb(
-    cuda_stream_t *stream, int_overflowing_sub_memory<Torus> **mem_ptr,
-    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_overflowing_sub_memory<Torus> **mem_ptr, uint32_t num_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
-  *mem_ptr = new int_overflowing_sub_memory<Torus>(stream, params, num_blocks,
-                                                   allocate_gpu_memory);
+  *mem_ptr = new int_overflowing_sub_memory<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory);
 }

 template <typename Torus, class params>
 __host__ void host_integer_overflowing_sub_kb(
-    cuda_stream_t *stream, Torus *radix_lwe_out, Torus *radix_lwe_overflowed,
-    Torus *radix_lwe_left, Torus *radix_lwe_right, void *bsk, uint64_t *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *radix_lwe_out, Torus *radix_lwe_overflowed, Torus *radix_lwe_left,
+    Torus *radix_lwe_right, void **bsks, uint64_t **ksks,
    int_overflowing_sub_memory<uint64_t> *mem_ptr, uint32_t num_blocks) {

  auto radix_params = mem_ptr->params;

  host_unchecked_sub_with_correcting_term(
-      stream, radix_lwe_out, radix_lwe_left, radix_lwe_right,
-      radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
-      radix_params.carry_modulus, radix_params.message_modulus - 1);
+      streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left,
+      radix_lwe_right, radix_params.big_lwe_dimension, num_blocks,
+      radix_params.message_modulus, radix_params.carry_modulus,
+      radix_params.message_modulus - 1);

  host_propagate_single_sub_borrow<Torus>(
-      stream, radix_lwe_overflowed, radix_lwe_out, mem_ptr->borrow_prop_mem,
-      bsk, ksk, num_blocks);
+      streams, gpu_indexes, gpu_count, radix_lwe_overflowed, radix_lwe_out,
+      mem_ptr->borrow_prop_mem, bsks, ksks, num_blocks);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
@@ -1,12 +1,12 @@
 #include "integer/scalar_addition.cuh"

 // Untyped entry point for the in-place scalar addition on 64-bit radix
 // ciphertexts. It only converts the opaque pointers to their concrete types
 // and forwards everything to host_integer_radix_scalar_addition_inplace.
 //
 // Parameters (new multi-GPU signature):
 //   streams      - opaque stream array, reinterpreted as cudaStream_t *.
 //   gpu_indexes  - GPU indexes, forwarded unchanged.
 //   gpu_count    - forwarded unchanged.
 //   lwe_array    - cast to uint64_t *; "inplace" in the function name
 //                  suggests it is both input and output - TODO confirm.
 //   scalar_input - cast to uint64_t *.
 //   lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus
 //                - forwarded unchanged.
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, void *scalar_input,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
     uint32_t message_modulus, uint32_t carry_modulus) {

   // The void ** stream array is passed on as cudaStream_t * via a C-style
   // cast; data pointers use static_cast to the 64-bit torus type.
   host_integer_radix_scalar_addition_inplace(
-      stream, static_cast<uint64_t *>(lwe_array),
-      static_cast<uint64_t *>(scalar_input), lwe_dimension,
-      lwe_ciphertext_count, message_modulus, carry_modulus);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
+      lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -27,10 +27,11 @@ __global__ void device_integer_radix_scalar_addition_inplace(

 template <typename Torus>
 __host__ void host_integer_radix_scalar_addition_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
-    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus) {
-  cudaSetDevice(stream->gpu_index);
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
+    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus) {
+  cudaSetDevice(gpu_indexes[0]);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
@@ -44,8 +45,7 @@ __host__ void host_integer_radix_scalar_addition_inplace(
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_scalar_addition_inplace<<<grid, thds, 0,
-                                                 stream->stream>>>(
+  device_integer_radix_scalar_addition_inplace<<<grid, thds, 0, streams[0]>>>(
      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
      delta);
  check_cuda_error(cudaGetLastError());
@@ -65,10 +65,11 @@ __global__ void device_integer_radix_add_scalar_one_inplace(

 template <typename Torus>
 __host__ void host_integer_radix_add_scalar_one_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, uint32_t lwe_dimension,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array, uint32_t lwe_dimension,
    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus) {
-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_indexes[0]);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
@@ -82,8 +83,7 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

-  device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0,
-                                                stream->stream>>>(
+  device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0, streams[0]>>>(
      lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
 }
@@ -104,10 +104,11 @@ __global__ void device_integer_radix_scalar_subtraction_inplace(

 template <typename Torus>
 __host__ void host_integer_radix_scalar_subtraction_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
-    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus) {
-  cudaSetDevice(stream->gpu_index);
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array, Torus *scalar_input, uint32_t lwe_dimension,
+    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus) {
+  cudaSetDevice(gpu_indexes[0]);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
@@ -122,7 +123,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
-                                                    stream->stream>>>(
+                                                    streams[0]>>>(
      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
      delta);
  check_cuda_error(cudaGetLastError());
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -1,14 +1,16 @@
 #include "integer/scalar_bitops.cuh"

 // Untyped entry point for scalar bitwise operations on 64-bit radix
 // ciphertexts. It converts the opaque pointers to their concrete types and
 // forwards to host_integer_radix_scalar_bitop_kb<uint64_t>.
 //
 // Parameters (new multi-GPU signature):
 //   streams              - opaque stream array, reinterpreted as
 //                          cudaStream_t *.
 //   gpu_indexes          - GPU indexes, forwarded unchanged.
 //   gpu_count            - forwarded unchanged.
 //   lwe_array_out        - cast to uint64_t *.
 //   lwe_array_input      - cast to uint64_t *.
 //   clear_blocks         - cast to uint64_t *.
 //   num_clear_blocks     - forwarded unchanged.
 //   mem_ptr              - opaque buffer, reinterpreted as
 //                          int_bitop_buffer<uint64_t> *.
 //   bsks                 - forwarded untouched as void **.
 //   ksks                 - reinterpreted as uint64_t **.
 //   lwe_ciphertext_count - forwarded in the slot the host template names
 //                          num_radix_blocks.
 //   op                   - which bitwise operation to apply.
 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_input,
-    void *clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_input, void *clear_blocks,
+    uint32_t num_clear_blocks, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t lwe_ciphertext_count, BITOP_TYPE op) {

   // Pure forwarding call: no work is done here besides the pointer casts.
   host_integer_radix_scalar_bitop_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
       static_cast<uint64_t *>(lwe_array_input),
       static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
       lwe_ciphertext_count, op);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -6,12 +6,11 @@

 template <typename Torus>
 __host__ void host_integer_radix_scalar_bitop_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_input,
-    Torus *clear_blocks, uint32_t num_clear_blocks,
-    int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
-    uint32_t num_radix_blocks, BITOP_TYPE op) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_input, Torus *clear_blocks,
+    uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr, void **bsks,
+    Torus **ksks, uint32_t num_radix_blocks, BITOP_TYPE op) {

-  cudaSetDevice(stream->gpu_index);
  auto lut = mem_ptr->lut;
  auto params = lut->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -21,28 +20,31 @@ __host__ void host_integer_radix_scalar_bitop_kb(
  if (num_clear_blocks == 0) {
    if (op == SCALAR_BITAND) {
      cuda_memset_async(lwe_array_out, 0,
-                        num_radix_blocks * lwe_size * sizeof(Torus), stream);
+                        num_radix_blocks * lwe_size * sizeof(Torus), streams[0],
+                        gpu_indexes[0]);
    } else {
      cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
                                   num_radix_blocks * lwe_size * sizeof(Torus),
-                                   stream);
+                                   streams[0], gpu_indexes[0]);
    }
  } else {
    // We have all possible LUTs pre-computed and we use the decomposed scalar
    // as index to recover the right one
-    cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
-                                 num_clear_blocks * sizeof(Torus), stream);
+    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(gpu_indexes[0], 0),
+                                 clear_blocks, num_clear_blocks * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
-        lut);
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
+        ksks, num_clear_blocks, lut);

    if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
      cuda_memset_async(lwe_array_out_block, 0,
                        (num_radix_blocks - num_clear_blocks) * lwe_size *
                            sizeof(Torus),
-                        stream);
+                        streams[0], gpu_indexes[0]);
    }
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
@@ -1,9 +1,10 @@
 #include "integer/scalar_comparison.cuh"

 void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
-    void *scalar_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, void *scalar_blocks,
+    int8_t *mem_ptr, void **bsks, void **ksks, uint32_t lwe_ciphertext_count,
+    uint32_t num_scalar_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -11,29 +12,32 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
  case EQ:
  case NE:
    host_integer_radix_scalar_equality_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
+        static_cast<uint64_t *>(scalar_blocks), buffer, bsks,
+        (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks);
    break;
  case GT:
  case GE:
  case LT:
  case LE:
    host_integer_radix_scalar_difference_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_in),
        static_cast<uint64_t *>(scalar_blocks), buffer,
-        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
+        buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
        lwe_ciphertext_count, num_scalar_blocks);
    break;
  case MAX:
  case MIN:
    host_integer_radix_scalar_maxmin_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
+        static_cast<uint64_t *>(scalar_blocks), buffer, bsks,
+        (uint64_t **)(ksks), lwe_ciphertext_count, num_scalar_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -6,12 +6,12 @@

 template <typename Torus>
 __host__ void integer_radix_unsigned_scalar_difference_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr,
+    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -46,9 +46,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
  if (total_num_scalar_blocks == 0) {
    // We only have to compare blocks with zero
    // means scalar is zero
-    host_compare_with_zero_equality(
-        stream, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsk, ksk,
-        total_num_radix_blocks, mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
+                                    mem_ptr->tmp_lwe_array_out, lwe_array_in,
+                                    mem_ptr, bsks, ksks, total_num_radix_blocks,
+                                    mem_ptr->is_zero_lut);

    auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
      x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -57,12 +58,15 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    };

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
-                                       polynomial_size, message_modulus,
-                                       carry_modulus, scalar_last_leaf_lut_f);
+    generate_device_accumulator<Torus>(
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        scalar_last_leaf_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);
+        streams, gpu_indexes, gpu_count, lwe_array_out,
+        mem_ptr->tmp_lwe_array_out, bsks, ksks, 1, lut);

  } else if (total_num_scalar_blocks < total_num_radix_blocks) {
    // We have to handle both part of the work described above
@@ -76,9 +80,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;

-    cuda_synchronize_stream(stream);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
+    auto lsb_streams = mem_ptr->lsb_streams;
+    auto msb_streams = mem_ptr->msb_streams;
+
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }

 #pragma omp parallel sections
    {
@@ -90,10 +97,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        Torus *lhs = diff_buffer->tmp_packed_left;
        Torus *rhs = diff_buffer->tmp_packed_right;

-        pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
-                    num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
-                    message_modulus);
+        pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+        pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                    total_num_scalar_blocks, message_modulus);

        // From this point we have half number of blocks
        num_lsb_radix_blocks /= 2;
@@ -105,28 +112,31 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        // - 2 if lhs > rhs

        auto comparisons = mem_ptr->tmp_block_comparisons;
-        scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
-                                       mem_ptr, bsk, ksk, num_lsb_radix_blocks);
+        scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                       comparisons, lhs, rhs, mem_ptr, bsks,
+                                       ksks, num_lsb_radix_blocks);

        // Reduces a vec containing radix blocks that encrypts a sign
        // (inferior, equal, superior) to one single radix block containing the
        // final sign
-        tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
-                            mem_ptr->diff_buffer->tree_buffer,
-                            mem_ptr->identity_lut_f, bsk, ksk,
-                            num_lsb_radix_blocks);
+        tree_sign_reduction(
+            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+            mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
+            ksks, num_lsb_radix_blocks);
      }
 #pragma omp section
      {
        //////////////
        // msb
-        host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
-                                        mem_ptr, bsk, ksk, num_msb_radix_blocks,
-                                        mem_ptr->is_zero_lut);
+        host_compare_with_zero_equality(
+            msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb,
+            mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
      }
    }
-    cuda_synchronize_stream(lsb_stream);
-    cuda_synchronize_stream(msb_stream);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
+      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
+    }

    //////////////
    // Reduce the two blocks into one final
@@ -141,12 +151,14 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
-        carry_modulus, scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_bivariate_lookup_table_kb(
-        stream, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsk, ksk,
-        1, lut);
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
+        lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);

  } else {
    // We only have to do the regular comparison
@@ -158,10 +170,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

-    pack_blocks(stream, lhs, lwe_array_in, big_lwe_dimension,
-                num_lsb_radix_blocks, message_modulus);
-    pack_blocks(stream, rhs, scalar_blocks, 0, num_scalar_blocks,
-                message_modulus);
+    pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+    pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                num_scalar_blocks, message_modulus);

    // From this point we have half number of blocks
    num_lsb_radix_blocks /= 2;
@@ -172,26 +184,28 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    // - 1 if lhs == rhs
    // - 2 if lhs > rhs
    auto comparisons = mem_ptr->tmp_lwe_array_out;
-    scalar_compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk,
-                                   ksk, num_lsb_radix_blocks);
+    scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
+                                   lhs, rhs, mem_ptr, bsks, ksks,
+                                   num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    tree_sign_reduction(stream, lwe_array_out, comparisons,
-                        mem_ptr->diff_buffer->tree_buffer, sign_handler_f, bsk,
-                        ksk, num_lsb_radix_blocks);
+    tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
+                        comparisons, mem_ptr->diff_buffer->tree_buffer,
+                        sign_handler_f, bsks, ksks, num_lsb_radix_blocks);
  }
 }

 template <typename Torus>
 __host__ void integer_radix_signed_scalar_difference_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr,
+    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -227,9 +241,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // We only have to compare blocks with zero
    // means scalar is zero
    Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
-    host_compare_with_zero_equality(stream, are_all_msb_zeros, lwe_array_in,
-                                    mem_ptr, bsk, ksk, total_num_radix_blocks,
-                                    mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality(
+        streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
+        mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
    Torus *sign_block =
        lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;

@@ -270,11 +284,14 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
-        carry_modulus, scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_bivariate_lookup_table_kb(
-        stream, lwe_array_out, are_all_msb_zeros, sign_block, bsk, ksk, 1, lut);
+        streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
+        sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);

  } else if (total_num_scalar_blocks < total_num_radix_blocks) {
    // We have to handle both part of the work described above
@@ -288,9 +305,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;

-    cuda_synchronize_stream(stream);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
+    auto lsb_streams = mem_ptr->lsb_streams;
+    auto msb_streams = mem_ptr->msb_streams;
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }

 #pragma omp parallel sections
    {
@@ -302,10 +321,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        Torus *lhs = diff_buffer->tmp_packed_left;
        Torus *rhs = diff_buffer->tmp_packed_right;

-        pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
-                    num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
-                    message_modulus);
+        pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+        pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                    total_num_scalar_blocks, message_modulus);

        // From this point we have half number of blocks
        num_lsb_radix_blocks /= 2;
@@ -317,16 +336,17 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        // - 2 if lhs > rhs

        auto comparisons = mem_ptr->tmp_block_comparisons;
-        scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
-                                       mem_ptr, bsk, ksk, num_lsb_radix_blocks);
+        scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                       comparisons, lhs, rhs, mem_ptr, bsks,
+                                       ksks, num_lsb_radix_blocks);

        // Reduces a vec containing radix blocks that encrypts a sign
        // (inferior, equal, superior) to one single radix block containing the
        // final sign
-        tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
-                            mem_ptr->diff_buffer->tree_buffer,
-                            mem_ptr->identity_lut_f, bsk, ksk,
-                            num_lsb_radix_blocks);
+        tree_sign_reduction(
+            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
+            mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
+            ksks, num_lsb_radix_blocks);
      }
 #pragma omp section
      {
@@ -334,9 +354,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        // msb
        // We remove the last block (which is the sign)
        Torus *are_all_msb_zeros = lwe_array_msb_out;
-        host_compare_with_zero_equality(msb_stream, are_all_msb_zeros, msb,
-                                        mem_ptr, bsk, ksk, num_msb_radix_blocks,
-                                        mem_ptr->is_zero_lut);
+        host_compare_with_zero_equality(
+            msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb,
+            mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);

        auto sign_bit_pos = (int)log2(message_modulus) - 1;

@@ -364,23 +384,28 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

        auto signed_msb_lut = mem_ptr->signed_msb_lut;
        generate_device_accumulator_bivariate<Torus>(
-            msb_stream, signed_msb_lut->lut, params.glwe_dimension,
+            msb_streams[0], gpu_indexes[0],
+            signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
            params.polynomial_size, params.message_modulus,
            params.carry_modulus, lut_f);
+        signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

        Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
        integer_radix_apply_bivariate_lookup_table_kb(
-            msb_stream, lwe_array_msb_out, sign_block, are_all_msb_zeros, bsk,
-            ksk, 1, signed_msb_lut);
+            msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
+            are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
+            signed_msb_lut->params.message_modulus);
      }
    }
-    cuda_synchronize_stream(lsb_stream);
-    cuda_synchronize_stream(msb_stream);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
+      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
+    }

    //////////////
    // Reduce the two blocks into one final
-    reduce_signs(stream, lwe_array_out, lwe_array_lsb_out, mem_ptr,
-                 sign_handler_f, bsk, ksk, 2);
+    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
+                 lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2);

  } else {
    // We only have to do the regular comparison
@@ -388,9 +413,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    // total_num_radix_blocks == total_num_scalar_blocks
    uint32_t num_lsb_radix_blocks = total_num_radix_blocks;

-    cuda_synchronize_stream(stream);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
+    auto lsb_streams = mem_ptr->lsb_streams;
+    auto msb_streams = mem_ptr->msb_streams;

    auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_sign_out =
@@ -403,10 +430,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        Torus *lhs = diff_buffer->tmp_packed_left;
        Torus *rhs = diff_buffer->tmp_packed_right;

-        pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
-                    num_lsb_radix_blocks - 1, message_modulus);
-        pack_blocks(lsb_stream, rhs, scalar_blocks, 0, num_lsb_radix_blocks - 1,
+        pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+                    big_lwe_dimension, num_lsb_radix_blocks - 1,
                    message_modulus);
+        pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+                    num_lsb_radix_blocks - 1, message_modulus);

        // From this point we have half number of blocks
        num_lsb_radix_blocks /= 2;
@@ -415,8 +443,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        // - 0 if lhs < rhs
        // - 1 if lhs == rhs
        // - 2 if lhs > rhs
-        scalar_compare_radix_blocks_kb(lsb_stream, lwe_array_ct_out, lhs, rhs,
-                                       mem_ptr, bsk, ksk, num_lsb_radix_blocks);
+        scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                       lwe_array_ct_out, lhs, rhs, mem_ptr,
+                                       bsks, ksks, num_lsb_radix_blocks);
      }
 #pragma omp section
      {
@@ -426,34 +455,38 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
            scalar_blocks + (total_num_scalar_blocks - 1);

        auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
-        create_trivial_radix(msb_stream, trivial_sign_block, scalar_sign_block,
-                             big_lwe_dimension, 1, 1, message_modulus,
-                             carry_modulus);
+        create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
+                             scalar_sign_block, big_lwe_dimension, 1, 1,
+                             message_modulus, carry_modulus);

        integer_radix_apply_bivariate_lookup_table_kb(
-            msb_stream, lwe_array_sign_out, encrypted_sign_block,
-            trivial_sign_block, bsk, ksk, 1, mem_ptr->signed_lut);
+            msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
+            encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
+            mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
      }
    }
-    cuda_synchronize_stream(lsb_stream);
-    cuda_synchronize_stream(msb_stream);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
+      cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
+    }

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
-    reduce_signs(stream, lwe_array_out, lwe_array_ct_out, mem_ptr,
-                 sign_handler_f, bsk, ksk, num_lsb_radix_blocks + 1);
+    reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
+                 lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
+                 num_lsb_radix_blocks + 1);
  }
 }

 template <typename Torus>
 __host__ void integer_radix_signed_scalar_maxmin_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t total_num_radix_blocks,
-    uint32_t total_num_scalar_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  // Calculates the difference sign between the ciphertext and the scalar
  // - 0 if lhs < rhs
@@ -461,8 +494,8 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
  // - 2 if lhs > rhs
  auto sign = mem_ptr->tmp_lwe_array_out;
  integer_radix_signed_scalar_difference_check_kb(
-      stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
-      mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
+      streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
+      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
      total_num_scalar_blocks);

  // There is no optimized CMUX for scalars, so we convert to a trivial
@@ -470,66 +503,69 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
  auto lwe_array_left = lwe_array_in;
  auto lwe_array_right = mem_ptr->tmp_block_comparisons;

-  create_trivial_radix(stream, lwe_array_right, scalar_blocks,
-                       params.big_lwe_dimension, total_num_radix_blocks,
-                       total_num_scalar_blocks, params.message_modulus,
-                       params.carry_modulus);
+  create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
+                       scalar_blocks, params.big_lwe_dimension,
+                       total_num_radix_blocks, total_num_scalar_blocks,
+                       params.message_modulus, params.carry_modulus);

  // Selector
  // CMUX for Max or Min
-  host_integer_radix_cmux_kb(stream, lwe_array_out, sign, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
+  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
+                             sign, lwe_array_left, lwe_array_right,
+                             mem_ptr->cmux_buffer, bsks, ksks,
                             total_num_radix_blocks);
 }

 template <typename Torus>
 __host__ void host_integer_radix_scalar_difference_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr,
+    std::function<Torus(Torus)> sign_handler_f, void **bsks, Torus **ksks,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

  if (mem_ptr->is_signed) {
    // is signed and scalar is positive
    integer_radix_signed_scalar_difference_check_kb(
-        stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr,
-        sign_handler_f, bsk, ksk, total_num_radix_blocks,
-        total_num_scalar_blocks);
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
+        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
+        total_num_radix_blocks, total_num_scalar_blocks);
  } else {
    integer_radix_unsigned_scalar_difference_check_kb(
-        stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr,
-        sign_handler_f, bsk, ksk, total_num_radix_blocks,
-        total_num_scalar_blocks);
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
+        scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
+        total_num_radix_blocks, total_num_scalar_blocks);
  }
 }

 template <typename Torus>
 __host__ void host_integer_radix_signed_scalar_maxmin_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t total_num_radix_blocks,
-    uint32_t total_num_scalar_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

  if (mem_ptr->is_signed) {
    // is signed and scalar is positive
    integer_radix_signed_scalar_maxmin_kb(
-        stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsk, ksk,
-        total_num_radix_blocks, total_num_scalar_blocks);
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
+        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
+        total_num_scalar_blocks);
  } else {
    integer_radix_unsigned_scalar_maxmin_kb(
-        stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsk, ksk,
-        total_num_radix_blocks, total_num_scalar_blocks);
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
+        scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
+        total_num_scalar_blocks);
  }
 }

 template <typename Torus>
-__host__ void
-scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                               Torus *lwe_array_in, Torus *scalar_blocks,
-                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                               Torus *ksk, uint32_t num_radix_blocks) {
+__host__ void scalar_compare_radix_blocks_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -549,37 +585,37 @@ scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
  // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000

  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
-  cuda_memcpy_async_gpu_to_gpu(
-      subtracted_blocks, lwe_array_in,
-      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
+  cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
+                               num_radix_blocks * (big_lwe_dimension + 1) *
+                                   sizeof(Torus),
+                               streams[0], gpu_indexes[0]);
  // Subtract
  // Here we need the true lwe sub, not the one that comes from shortint.
  host_integer_radix_scalar_subtraction_inplace(
-      stream, subtracted_blocks, scalar_blocks, big_lwe_dimension,
-      num_radix_blocks, message_modulus, carry_modulus);
+      streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
+      big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);

  // Apply LUT to compare to 0
  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
-                                                 subtracted_blocks, bsk, ksk,
-                                                 num_radix_blocks, sign_lut);
+  integer_radix_apply_univariate_lookup_table_kb(
+      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
+      ksks, num_radix_blocks, sign_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
-                                            big_lwe_dimension, num_radix_blocks,
-                                            message_modulus, carry_modulus);
+  host_integer_radix_add_scalar_one_inplace(
+      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
+      num_radix_blocks, message_modulus, carry_modulus);
 }

 template <typename Torus>
 __host__ void host_integer_radix_scalar_maxmin_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t total_num_radix_blocks,
-    uint32_t total_num_scalar_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;

  // Calculates the difference sign between the ciphertext and the scalar
@@ -588,8 +624,8 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  // - 2 if lhs > rhs
  auto sign = mem_ptr->tmp_lwe_array_out;
  host_integer_radix_scalar_difference_check_kb(
-      stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
-      mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
+      streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
+      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
      total_num_scalar_blocks);

  // There is no optimized CMUX for scalars, so we convert to a trivial
@@ -597,23 +633,25 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  auto lwe_array_left = lwe_array_in;
  auto lwe_array_right = mem_ptr->tmp_block_comparisons;

-  create_trivial_radix(stream, lwe_array_right, scalar_blocks,
-                       params.big_lwe_dimension, total_num_radix_blocks,
-                       total_num_scalar_blocks, params.message_modulus,
-                       params.carry_modulus);
+  create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
+                       scalar_blocks, params.big_lwe_dimension,
+                       total_num_radix_blocks, total_num_scalar_blocks,
+                       params.message_modulus, params.carry_modulus);

  // Selector
  // CMUX for Max or Min
-  host_integer_radix_cmux_kb(
-      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
+  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
+                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
+                             lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
+                             total_num_radix_blocks);
 }

 template <typename Torus>
 __host__ void host_integer_radix_scalar_equality_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
+    int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
+    uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -641,10 +679,12 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
  auto lwe_array_msb_out =
      lwe_array_lsb_out + big_lwe_size * num_halved_lsb_radix_blocks;

-  cuda_synchronize_stream(stream);
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }

-  auto lsb_stream = mem_ptr->lsb_stream;
-  auto msb_stream = mem_ptr->msb_stream;
+  auto lsb_streams = mem_ptr->lsb_streams;
+  auto msb_streams = mem_ptr->msb_streams;

 #pragma omp parallel sections
  {
@@ -656,18 +696,21 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
        auto packed_scalar =
            packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;

-        pack_blocks(lsb_stream, packed_blocks, lsb, big_lwe_dimension,
-                    num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, packed_scalar, scalar_blocks, 0,
-                    num_scalar_blocks, message_modulus);
+        pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
+                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+        pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar,
+                    scalar_blocks, 0, num_scalar_blocks, message_modulus);

        cuda_memcpy_async_gpu_to_gpu(
-            scalar_comparison_luts->lut_indexes, packed_scalar,
-            num_halved_scalar_blocks * sizeof(Torus), lsb_stream);
+            scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
+            packed_scalar, num_halved_scalar_blocks * sizeof(Torus),
+            lsb_streams[0], gpu_indexes[0]);
+        scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

        integer_radix_apply_univariate_lookup_table_kb(
-            lsb_stream, lwe_array_lsb_out, packed_blocks, bsk, ksk,
-            num_halved_lsb_radix_blocks, scalar_comparison_luts);
+            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
+            packed_blocks, bsks, ksks, num_halved_lsb_radix_blocks,
+            scalar_comparison_luts);
      }
    }
 #pragma omp section
@@ -687,25 +730,29 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
          PANIC("Cuda error: integer operation not supported")
        }

-        host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
-                                        mem_ptr, bsk, ksk, num_msb_radix_blocks,
-                                        msb_lut);
+        host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
+                                        lwe_array_msb_out, msb, mem_ptr, bsks,
+                                        ksks, num_msb_radix_blocks, msb_lut);
      }
    }
  }

-  cuda_synchronize_stream(lsb_stream);
-  cuda_synchronize_stream(msb_stream);
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
+    cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
+  }

  switch (mem_ptr->op) {
  case COMPARISON_TYPE::EQ:
    are_all_comparisons_block_true(
-        stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
+        mem_ptr, bsks, ksks,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
    break;
  case COMPARISON_TYPE::NE:
    is_at_least_one_comparisons_block_true(
-        stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
+        mem_ptr, bsks, ksks,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
    break;
  default:
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -1,11 +1,12 @@
 #include "integer/scalar_mul.cuh"

 void scratch_cuda_integer_scalar_mul_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -13,64 +14,71 @@ void scratch_cuda_integer_scalar_mul_kb_64(
                          grouping_factor, message_modulus, carry_modulus);

  scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
-      stream, (int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }

 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, uint64_t *decomposed_scalar,
-    uint64_t *has_at_least_one_set, int8_t *mem, void *bsk, void *ksk,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t num_blocks, uint32_t num_scalars) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    uint64_t *decomposed_scalar, uint64_t *has_at_least_one_set, int8_t *mem,
+    void **bsks, void **ksks, uint32_t lwe_dimension, uint32_t polynomial_size,
+    uint32_t message_modulus, uint32_t num_blocks, uint32_t num_scalars) {

  switch (polynomial_size) {
  case 512:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 1024:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 2048:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 4096:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 8192:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  case 16384:
    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array), decomposed_scalar,
        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), lwe_dimension, message_modulus, num_blocks,
+        num_scalars);
    break;
  default:
    PANIC("Cuda error (scalar multiplication): unsupported polynomial size. "
@@ -78,12 +86,13 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
  }
 }

-void cleanup_cuda_integer_radix_scalar_mul(cuda_stream_t *stream,
+void cleanup_cuda_integer_radix_scalar_mul(void **streams,
+                                           uint32_t *gpu_indexes,
+                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void) {

-  cudaSetDevice(stream->gpu_index);
  int_scalar_mul_buffer<uint64_t> *mem_ptr =
      (int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -29,33 +29,42 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,

 template <typename T>
 __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
-    cuda_stream_t *stream, int_scalar_mul_buffer<T> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_scalar_mul_buffer<T> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
-  check_cuda_error(cudaFuncSetAttribute(
-      tree_add_chunks<T>, cudaFuncAttributeMaxDynamicSharedMemorySize,
-      sm_size));
-  cudaFuncSetCacheConfig(tree_add_chunks<T>, cudaFuncCachePreferShared);
-  check_cuda_error(cudaGetLastError());
+  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
+    check_cuda_error(cudaFuncSetAttribute(
+        tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
+        sm_size));
+    cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
+                           cudaFuncCachePreferShared);
+    check_cuda_error(cudaGetLastError());
+  } else {
+    check_cuda_error(
+        cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
+                             cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
+    cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
+    check_cuda_error(cudaGetLastError());
+  }

-  *mem_ptr = new int_scalar_mul_buffer<T>(stream, params, num_radix_blocks,
-                                          allocate_gpu_memory);
+  *mem_ptr =
+      new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
+                                   num_radix_blocks, allocate_gpu_memory);
 }

 template <typename T, class params>
 __host__ void host_integer_scalar_mul_radix(
-    cuda_stream_t *stream, T *lwe_array, T *decomposed_scalar,
-    T *has_at_least_one_set, int_scalar_mul_buffer<T> *mem, void *bsk, T *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    T *lwe_array, T *decomposed_scalar, T *has_at_least_one_set,
+    int_scalar_mul_buffer<T> *mem, void **bsks, T **ksks,
    uint32_t input_lwe_dimension, uint32_t message_modulus,
    uint32_t num_radix_blocks, uint32_t num_scalars) {

  if (num_radix_blocks == 0 | num_scalars == 0)
    return;

-  cudaSetDevice(stream->gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  uint32_t lwe_size = input_lwe_dimension + 1;
@@ -70,13 +79,15 @@ __host__ void host_integer_scalar_mul_radix(
    T *ptr = preshifted_buffer + shift_amount * lwe_size * num_radix_blocks;
    if (has_at_least_one_set[shift_amount] == 1) {
      cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
-                                   lwe_size_bytes * num_radix_blocks, stream);
+                                   lwe_size_bytes * num_radix_blocks,
+                                   streams[0], gpu_indexes[0]);
      host_integer_radix_logical_scalar_shift_kb_inplace(
-          stream, ptr, shift_amount, mem->logical_scalar_shift_buffer, bsk, ksk,
-          num_radix_blocks);
+          streams, gpu_indexes, gpu_count, ptr, shift_amount,
+          mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
    } else {
      // create trivial assign for value = 0
-      cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, stream);
+      cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, streams[0],
+                        gpu_indexes[0]);
    }
  }
  size_t j = 0;
@@ -87,37 +98,40 @@ __host__ void host_integer_scalar_mul_radix(
          preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
      T *block_shift_buffer =
          all_shifted_buffer + j * num_radix_blocks * lwe_size;
-      radix_blocks_rotate_right<<<num_radix_blocks, 256, 0, stream->stream>>>(
-          block_shift_buffer, preshifted_radix_ct, i / msg_bits,
-          num_radix_blocks, lwe_size);
+      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
+                                     block_shift_buffer, preshifted_radix_ct,
+                                     i / msg_bits, num_radix_blocks, lwe_size);
      // create trivial assign for value = 0
      cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
-                        stream);
+                        streams[0], gpu_indexes[0]);
      j++;
    }
  }

  if (j == 0) {
    // lwe array = 0
-    cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes, stream);
+    cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes,
+                      streams[0], gpu_indexes[0]);
  } else {
    int terms_degree[j * num_radix_blocks];
    for (int i = 0; i < j * num_radix_blocks; i++) {
      terms_degree[i] = message_modulus - 1;
    }
    host_integer_sum_ciphertexts_vec_kb<T, params>(
-        stream, lwe_array, all_shifted_buffer, terms_degree, bsk, ksk,
-        mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);
+        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
+        terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
+        num_radix_blocks, j);
  }
 }

 // Small scalar_mul is used in shift/rotate
 template <typename T>
 __host__ void host_integer_small_scalar_mul_radix(
-    cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    T *output_lwe_array, T *input_lwe_array, T scalar,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_indexes[0]);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -128,7 +142,7 @@ __host__ void host_integer_small_scalar_mul_radix(
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
+  device_small_scalar_radix_multiplication<<<grid, thds, 0, streams[0]>>>(
      output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
      input_lwe_ciphertext_count);
  check_cuda_error(cudaGetLastError());
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -1,12 +1,12 @@
 #include "scalar_rotate.cuh"

 void scratch_cuda_integer_radix_scalar_rotate_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -15,27 +15,30 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
-      stream, (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
-      params, shift_type, allocate_gpu_memory);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
+      shift_type, allocate_gpu_memory);
 }

-void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,
-                                                    void *lwe_array, uint32_t n,
-                                                    int8_t *mem_ptr, void *bsk,
-                                                    void *ksk,
-                                                    uint32_t num_blocks) {
+void cuda_integer_radix_scalar_rotate_kb_64_inplace(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    uint32_t n, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t num_blocks) {

  host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array), n,
-      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), n,
+      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_integer_radix_scalar_rotate(cuda_stream_t *stream,
+void cleanup_cuda_integer_radix_scalar_rotate(void **streams,
+                                              uint32_t *gpu_indexes,
+                                              uint32_t gpu_count,
                                              int8_t **mem_ptr_void) {

  int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
      (int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -13,22 +13,22 @@

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
-    cuda_stream_t *stream, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
-      stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
+      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
+      allocate_gpu_memory);
 }

 template <typename Torus>
 __host__ void host_integer_radix_scalar_rotate_kb_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, uint32_t n,
-    int_logical_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
-    uint32_t num_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array, uint32_t n, int_logical_scalar_shift_buffer<Torus> *mem,
+    void **bsks, Torus **ksks, uint32_t num_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -57,11 +57,14 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
  // block_count blocks will be used in the grid
  // one block is responsible to process single lwe ciphertext
  if (mem->shift_type == LEFT_SHIFT) {
-    radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
-        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
+    // rotate right as the blocks are from LSB to MSB
+    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
+                                   rotated_buffer, lwe_array, rotations,
+                                   num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
-                                 num_blocks * big_lwe_size_bytes, stream);
+                                 num_blocks * big_lwe_size_bytes, streams[0],
+                                 gpu_indexes[0]);

    if (shift_within_block == 0) {
      return;
@@ -69,20 +72,24 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
-    radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
-        giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
+                                   giver_blocks, lwe_array, 1, num_blocks,
+                                   big_lwe_size);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
-        lut_bivariate);
+        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
+        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
+        lut_bivariate->params.message_modulus);

  } else {
-    // left shift
-    radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
-        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
+    // rotate left as the blocks are from LSB to MSB
+    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
+                                  rotated_buffer, lwe_array, rotations,
+                                  num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
-                                 num_blocks * big_lwe_size_bytes, stream);
+                                 num_blocks * big_lwe_size_bytes, streams[0],
+                                 gpu_indexes[0]);

    if (shift_within_block == 0) {
      return;
@@ -90,12 +97,13 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
-    radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
-        giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
+                                  lwe_array, 1, num_blocks, big_lwe_size);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
-        lut_bivariate);
+        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
+        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
+        lut_bivariate->params.message_modulus);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -1,12 +1,12 @@
 #include "scalar_shifts.cuh"

 void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -15,8 +15,9 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
-      stream, (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
-      params, shift_type, allocate_gpu_memory);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
+      shift_type, allocate_gpu_memory);
 }

 /// The logical scalar shift is the one used for unsigned integers, and
@@ -24,22 +25,24 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
 /// the application of a PBS onto the rotated blocks up to num_blocks -
 /// rotations - 1 The remaining blocks are padded with zeros
 void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
-    void *bsk, void *ksk, uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t num_blocks) {

  host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array), shift,
-      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), shift,
+      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
 }

 void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -48,8 +51,9 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
-      stream, (int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr,
-      num_blocks, params, shift_type, allocate_gpu_memory);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
+      params, shift_type, allocate_gpu_memory);
 }

 /// The arithmetic scalar shift is the one used for the signed right shift.
@@ -60,31 +64,35 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
 /// block, which is copied onto all remaining blocks instead of padding with
 /// zeros as would be done in the logical shift.
 void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
-    void *bsk, void *ksk, uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    uint32_t shift, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t num_blocks) {

  host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array), shift,
-      (int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), shift,
+      (int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_integer_radix_logical_scalar_shift(cuda_stream_t *stream,
+void cleanup_cuda_integer_radix_logical_scalar_shift(void **streams,
+                                                     uint32_t *gpu_indexes,
+                                                     uint32_t gpu_count,
                                                     int8_t **mem_ptr_void) {

-  cudaSetDevice(stream->gpu_index);
  int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
      (int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }

-void cleanup_cuda_integer_radix_arithmetic_scalar_shift(cuda_stream_t *stream,
+void cleanup_cuda_integer_radix_arithmetic_scalar_shift(void **streams,
+                                                        uint32_t *gpu_indexes,
+                                                        uint32_t gpu_count,
                                                        int8_t **mem_ptr_void) {

-  cudaSetDevice(stream->gpu_index);
  int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
      (int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -1,5 +1,5 @@
-#ifndef CUDA_INTEGER_SCALAR_SHIFT_OPS_CUH
-#define CUDA_INTEGER_SCALAR_SHIFT_OPS_CUH
+#ifndef CUDA_INTEGER_SCALAR_SHIFT_CUH
+#define CUDA_INTEGER_SCALAR_SHIFT_CUH

 #include "crypto/keyswitch.cuh"
 #include "device.h"
@@ -14,22 +14,23 @@

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
-    cuda_stream_t *stream, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_logical_scalar_shift_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
-      stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
+      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
+      allocate_gpu_memory);
 }

 template <typename Torus>
 __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
-    int_logical_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array, uint32_t shift,
+    int_logical_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
    uint32_t num_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -55,18 +56,21 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

  // rotate right all the blocks in radix ciphertext
  // copy result in new buffer
-  // 256 threads are used in every block
+  // 1024 threads are used in every block
  // block_count blocks will be used in the grid
  // one block is responsible to process single lwe ciphertext
  if (mem->shift_type == LEFT_SHIFT) {
-    radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
-        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
+    // rotate right as the blocks are from LSB to MSB
+    host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
+                                   rotated_buffer, lwe_array, rotations,
+                                   num_blocks, big_lwe_size);

    // create trivial assign for value = 0
    cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
-                      stream);
+                      streams[0], gpu_indexes[0]);
    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
-                                 num_blocks * big_lwe_size_bytes, stream);
+                                 num_blocks * big_lwe_size_bytes, streams[0],
+                                 gpu_indexes[0]);

    if (shift_within_block == 0 || rotations == num_blocks) {
      return;
@@ -79,20 +83,25 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
    size_t partial_block_count = num_blocks - rotations;

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, partial_current_blocks, partial_current_blocks,
-        partial_previous_blocks, bsk, ksk, partial_block_count, lut_bivariate);
+        streams, gpu_indexes, gpu_count, partial_current_blocks,
+        partial_current_blocks, partial_previous_blocks, bsks, ksks,
+        partial_block_count, lut_bivariate,
+        lut_bivariate->params.message_modulus);

  } else {
    // right shift
-    radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
-        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
+                                  rotated_buffer, lwe_array, rotations,
+                                  num_blocks, big_lwe_size);

    // rotate left as the blocks are from LSB to MSB
    // create trivial assign for value = 0
    cuda_memset_async(rotated_buffer + (num_blocks - rotations) * big_lwe_size,
-                      0, rotations * big_lwe_size_bytes, stream);
+                      0, rotations * big_lwe_size_bytes, streams[0],
+                      gpu_indexes[0]);
    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
-                                 num_blocks * big_lwe_size_bytes, stream);
+                                 num_blocks * big_lwe_size_bytes, streams[0],
+                                 gpu_indexes[0]);

    if (shift_within_block == 0 || rotations == num_blocks) {
      return;
@@ -104,29 +113,34 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
    size_t partial_block_count = num_blocks - rotations;

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, partial_current_blocks, partial_current_blocks,
-        partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
+        streams, gpu_indexes, gpu_count, partial_current_blocks,
+        partial_current_blocks, partial_next_blocks, bsks, ksks,
+        partial_block_count, lut_bivariate,
+        lut_bivariate->params.message_modulus);
  }
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
-    cuda_stream_t *stream, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
-      stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
+      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
+      allocate_gpu_memory);
 }

 template <typename Torus>
 __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
-    int_arithmetic_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array, uint32_t shift,
+    int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
    uint32_t num_blocks) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_indexes[0]);
+
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -156,10 +170,12 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

  if (mem->shift_type == RIGHT_SHIFT) {
-    radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
-        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
+                                  rotated_buffer, lwe_array, rotations,
+                                  num_blocks, big_lwe_size);
    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
-                                 num_blocks * big_lwe_size_bytes, stream);
+                                 num_blocks * big_lwe_size_bytes, streams[0],
+                                 gpu_indexes[0]);

    if (num_bits_in_block == 1) {
      // if there is only 1 bit in the msg part, it means shift_within block is
@@ -175,7 +191,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
          rotated_buffer + (num_blocks - rotations) * big_lwe_size;
      for (uint i = 0; i < num_blocks; i++) {
        cuda_memcpy_async_gpu_to_gpu(block_dest, block_src, big_lwe_size_bytes,
-                                     stream);
+                                     streams[0], gpu_indexes[0]);
        block_dest += big_lwe_size;
      }
      return;
@@ -185,47 +201,54 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
    // bit. This creates the need for a different shifting lut than in the
    // logical shift case. We also need another PBS to create the padding block.
    Torus *last_block = lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
-    cuda_memcpy_async_gpu_to_gpu(last_block_copy,
-                                 rotated_buffer + (num_blocks - rotations - 1) *
-                                                      big_lwe_size,
-                                 big_lwe_size_bytes, stream);
+    cuda_memcpy_async_gpu_to_gpu(
+        last_block_copy,
+        rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
    auto partial_current_blocks = lwe_array;
    auto partial_next_blocks = &rotated_buffer[big_lwe_size];
    size_t partial_block_count = num_blocks - rotations;
    if (shift_within_block != 0 && rotations != num_blocks) {
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-          stream, partial_current_blocks, partial_current_blocks,
-          partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
+          streams, gpu_indexes, gpu_count, partial_current_blocks,
+          partial_current_blocks, partial_next_blocks, bsks, ksks,
+          partial_block_count, lut_bivariate,
+          lut_bivariate->params.message_modulus);
    }
    // Since our CPU threads will be working on different streams we shall
    // assert the work in the main stream is completed
-    stream->synchronize();
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
 #pragma omp parallel sections
    {
      // All sections may be executed in parallel
 #pragma omp section
      {
        integer_radix_apply_univariate_lookup_table_kb(
-            mem->local_stream_1, padding_block, last_block_copy, bsk, ksk, 1,
-            lut_univariate_padding_block);
+            mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
+            last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
        // Replace blocks 'pulled' from the left with the correct padding block
        for (uint i = 0; i < rotations; i++) {
          cuda_memcpy_async_gpu_to_gpu(
              lwe_array + (num_blocks - rotations + i) * big_lwe_size,
-              padding_block, big_lwe_size_bytes, mem->local_stream_1);
+              padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
+              gpu_indexes[0]);
        }
      }
 #pragma omp section
      {
        if (shift_within_block != 0 && rotations != num_blocks) {
          integer_radix_apply_univariate_lookup_table_kb(
-              mem->local_stream_2, last_block, last_block_copy, bsk, ksk, 1,
-              lut_univariate_shift_last_block);
+              mem->local_streams_2, gpu_indexes, gpu_count, last_block,
+              last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
        }
      }
    }
-    cuda_synchronize_stream(mem->local_stream_1);
-    cuda_synchronize_stream(mem->local_stream_2);
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
+    }

  } else {
    PANIC("Cuda error (scalar shift): left scalar shift is never of the "
@@ -233,4 +256,4 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
  }
 }

-#endif // CUDA_SCALAR_OPS_CUH
+#endif // CUDA_INTEGER_SCALAR_SHIFT_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
@@ -1,13 +1,13 @@
 #include "shift_and_rotate.cuh"

 void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
-    bool allocate_gpu_memory) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
+    bool is_signed, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -15,26 +15,29 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_shift_and_rotate_kb<uint64_t>(
-      stream, (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr, num_blocks,
-      params, shift_type, is_signed, allocate_gpu_memory);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr, num_blocks, params,
+      shift_type, is_signed, allocate_gpu_memory);
 }

 void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, void *lwe_shift, int8_t *mem_ptr,
-    void *bsk, void *ksk, uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
+    void *lwe_shift, int8_t *mem_ptr, void **bsks, void **ksks,
+    uint32_t num_blocks) {

  host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array),
-      static_cast<uint64_t *>(lwe_shift),
-      (int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_shift),
+      (int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
+      (uint64_t **)(ksks), num_blocks);
 }

-void cleanup_cuda_integer_radix_shift_and_rotate(cuda_stream_t *stream,
+void cleanup_cuda_integer_radix_shift_and_rotate(void **streams,
+                                                 uint32_t *gpu_indexes,
+                                                 uint32_t gpu_count,
                                                 int8_t **mem_ptr_void) {
-
  int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
      (int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);

-  mem_ptr->release(stream);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -14,33 +14,36 @@

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(
-    cuda_stream_t *stream, int_shift_and_rotate_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
-  cudaSetDevice(stream->gpu_index);
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    int_shift_and_rotate_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed,
+    bool allocate_gpu_memory) {
  *mem_ptr = new int_shift_and_rotate_buffer<Torus>(
-      stream, shift_type, is_signed, params, num_radix_blocks,
-      allocate_gpu_memory);
+      streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
+      num_radix_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
 __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, Torus *lwe_shift,
-    int_shift_and_rotate_buffer<Torus> *mem, void *bsk, Torus *ksk,
-    uint32_t num_radix_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array, Torus *lwe_shift, int_shift_and_rotate_buffer<Torus> *mem,
+    void **bsks, Torus **ksks, uint32_t num_radix_blocks) {
  uint32_t bits_per_block = std::log2(mem->params.message_modulus);
  uint32_t total_nb_bits = bits_per_block * num_radix_blocks;
+  if (total_nb_bits == 0)
+    return;

  auto big_lwe_dimension = mem->params.big_lwe_dimension;
  auto big_lwe_size = big_lwe_dimension + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

-  assert(total_nb_bits > 0);
+  cudaSetDevice(gpu_indexes[0]);

  // Extract all bits
  auto bits = mem->tmp_bits;
-  extract_n_bits<Torus>(stream, bits, lwe_array, bsk, ksk, num_radix_blocks,
-                        bits_per_block, mem->bit_extract_luts);
+  extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
+                        ksks, num_radix_blocks, bits_per_block,
+                        mem->bit_extract_luts);

  // Extract shift bits
  auto shift_bits = mem->tmp_shift_bits;
@@ -59,8 +62,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  // Extracts bits and put them in the bit index 2 (=> bit number 3)
  // so that it is already aligned to the correct position of the cmux input
  // and we reduce noise growth
-  extract_n_bits<Torus>(stream, shift_bits, lwe_shift, bsk, ksk, 1,
-                        max_num_bits_that_tell_shift,
+  extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
+                        bsks, ksks, 1, max_num_bits_that_tell_shift,
                        mem->bit_extract_luts_with_offset_2);

  // If signed, do an "arithmetic shift" by padding with the sign bit
@@ -74,48 +77,59 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  auto mux_inputs = mem->tmp_mux_inputs;

  cuda_memcpy_async_gpu_to_gpu(input_bits_a, bits,
-                               total_nb_bits * big_lwe_size_bytes, stream);
+                               total_nb_bits * big_lwe_size_bytes, streams[0],
+                               gpu_indexes[0]);
  for (int d = 0; d < max_num_bits_that_tell_shift; d++) {
    auto shift_bit = shift_bits + d * big_lwe_size;

    cuda_memcpy_async_gpu_to_gpu(input_bits_b, input_bits_a,
-                                 total_nb_bits * big_lwe_size_bytes, stream);
+                                 total_nb_bits * big_lwe_size_bytes, streams[0],
+                                 gpu_indexes[0]);

    auto rotations = 1 << d;
    switch (mem->shift_type) {
    case LEFT_SHIFT:
-      radix_blocks_rotate_right<<<total_nb_bits, 256, 0, stream->stream>>>(
-          rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
+      // rotate right as the blocks are from LSB to MSB
+      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
+                                     rotated_input, input_bits_b, rotations,
+                                     total_nb_bits, big_lwe_size);

      if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
        for (int i = 0; i < rotations; i++)
          cuda_memcpy_async_gpu_to_gpu(rotated_input + i * big_lwe_size,
-                                       last_bit, big_lwe_size_bytes, stream);
+                                       last_bit, big_lwe_size_bytes, streams[0],
+                                       gpu_indexes[0]);
      else
        cuda_memset_async(rotated_input, 0, rotations * big_lwe_size_bytes,
-                          stream);
+                          streams[0], gpu_indexes[0]);
      break;
    case RIGHT_SHIFT:
-      radix_blocks_rotate_left<<<total_nb_bits, 256, 0, stream->stream>>>(
-          rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
+      // rotate left as the blocks are from LSB to MSB
+      host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
+                                    rotated_input, input_bits_b, rotations,
+                                    total_nb_bits, big_lwe_size);

      if (mem->is_signed)
        for (int i = 0; i < rotations; i++)
          cuda_memcpy_async_gpu_to_gpu(
              rotated_input + (total_nb_bits - rotations + i) * big_lwe_size,
-              last_bit, big_lwe_size_bytes, stream);
+              last_bit, big_lwe_size_bytes, streams[0], gpu_indexes[0]);
      else
-        cuda_memset_async(rotated_input +
-                              (total_nb_bits - rotations) * big_lwe_size,
-                          0, rotations * big_lwe_size_bytes, stream);
+        cuda_memset_async(
+            rotated_input + (total_nb_bits - rotations) * big_lwe_size, 0,
+            rotations * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
      break;
    case LEFT_ROTATE:
-      radix_blocks_rotate_right<<<total_nb_bits, 256, 0, stream->stream>>>(
-          rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
+      // rotate right as the blocks are from LSB to MSB
+      host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
+                                     rotated_input, input_bits_b, rotations,
+                                     total_nb_bits, big_lwe_size);
      break;
    case RIGHT_ROTATE:
-      radix_blocks_rotate_left<<<total_nb_bits, 256, 0, stream->stream>>>(
-          rotated_input, input_bits_b, rotations, total_nb_bits, big_lwe_size);
+      // rotate left as the blocks are from LSB to MSB
+      host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
+                                    rotated_input, input_bits_b, rotations,
+                                    total_nb_bits, big_lwe_size);
      break;
    default:
      PANIC("Unknown operation")
@@ -124,21 +138,23 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    // pack bits into one block so that we have
    // control_bit|b|a
    cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
-                      stream); // Do we need this?
-    pack_bivariate_blocks(stream, mux_inputs, mux_lut->lwe_indexes_out,
-                          rotated_input, input_bits_a, mux_lut->lwe_indexes_in,
-                          big_lwe_dimension, 2, total_nb_bits);
+                      streams[0], gpu_indexes[0]); // Do we need this?
+    pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs,
+                          mux_lut->lwe_indexes_out, rotated_input, input_bits_a,
+                          mux_lut->lwe_indexes_in, big_lwe_dimension, 2,
+                          total_nb_bits);

    // The shift bit is already properly aligned/positioned
    for (int i = 0; i < total_nb_bits; i++)
-      host_addition(stream, mux_inputs + i * big_lwe_size,
+      host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size,
                    mux_inputs + i * big_lwe_size, shift_bit,
                    mem->params.big_lwe_dimension, 1);

    // we have
    // control_bit|b|a
    integer_radix_apply_univariate_lookup_table_kb(
-        stream, input_bits_a, mux_inputs, bsk, ksk, total_nb_bits, mux_lut);
+        streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
+        total_nb_bits, mux_lut);
  }

  // Initializes the output
@@ -147,7 +163,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  last_bit = input_bits_a + (bits_per_block - 1) * big_lwe_size;
  for (int i = 0; i < num_radix_blocks; i++) {
    cuda_memcpy_async_gpu_to_gpu(lwe_last_out, last_bit, big_lwe_size_bytes,
-                                 stream);
+                                 streams[0], gpu_indexes[0]);

    lwe_last_out += big_lwe_size;
    last_bit += bits_per_block * big_lwe_size;
@@ -158,14 +174,15 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  for (int i = bits_per_block - 2; i >= 0; i--) {

    host_integer_small_scalar_mul_radix<Torus>(
-        stream, lwe_last_out, lwe_last_out, 2, big_lwe_dimension,
-        num_radix_blocks);
+        streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, 2,
+        big_lwe_dimension, num_radix_blocks);

    auto block = lwe_last_out;
    auto bit_to_add = input_bits_a + i * big_lwe_size;

    for (int j = 0; j < num_radix_blocks; j++) {
-      host_addition(stream, block, block, bit_to_add, big_lwe_dimension, 1);
+      host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add,
+                    big_lwe_dimension, 1);

      block += big_lwe_size;
      bit_to_add += bits_per_block * big_lwe_size;
@@ -174,8 +191,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    // To give back a clean ciphertext
    auto cleaning_lut = mem->cleaning_lut;
    integer_radix_apply_univariate_lookup_table_kb(
-        stream, lwe_last_out, lwe_last_out, bsk, ksk, num_radix_blocks,
-        cleaning_lut);
+        streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
+        num_radix_blocks, cleaning_lut);
  }
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -4,14 +4,15 @@
 * Perform the addition of two u32 input LWE ciphertext vectors.
 * See the equivalent operation on u64 ciphertexts for more details.
 */
-void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
+void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {

-  host_addition(stream, static_cast<uint32_t *>(lwe_array_out),
+  host_addition(static_cast<cudaStream_t>(stream), gpu_index,
+                static_cast<uint32_t *>(lwe_array_out),
                static_cast<uint32_t *>(lwe_array_in_1),
                static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
                input_lwe_ciphertext_count);
@@ -43,14 +44,15 @@ void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
 * vectors are left unchanged. This function is a wrapper to a device function
 * that performs the operation on the GPU.
 */
-void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
+void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count) {

-  host_addition(stream, static_cast<uint64_t *>(lwe_array_out),
+  host_addition(static_cast<cudaStream_t>(stream), gpu_index,
+                static_cast<uint64_t *>(lwe_array_out),
                static_cast<uint64_t *>(lwe_array_in_1),
                static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
                input_lwe_ciphertext_count);
@@ -60,11 +62,12 @@ void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
 * plaintext vector. See the equivalent operation on u64 data for more details.
 */
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_addition_plaintext(stream, static_cast<uint32_t *>(lwe_array_out),
+  host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint32_t *>(lwe_array_out),
                          static_cast<uint32_t *>(lwe_array_in),
                          static_cast<uint32_t *>(plaintext_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
@@ -98,11 +101,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
 * performs the operation on the GPU.
 */
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_addition_plaintext(stream, static_cast<uint64_t *>(lwe_array_out),
+  host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint64_t *>(lwe_array_out),
                          static_cast<uint64_t *>(lwe_array_in),
                          static_cast<uint64_t *>(plaintext_array_in),
                          input_lwe_dimension, input_lwe_ciphertext_count);
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -6,9 +6,10 @@
 #include <cuda_runtime.h>
 #endif

-#include "../utils/kernel_dimensions.cuh"
 #include "device.h"
+#include "helper_multi_gpu.h"
 #include "linear_algebra.h"
+#include "utils/kernel_dimensions.cuh"
 #include <stdio.h>

 template <typename T>
@@ -27,21 +28,22 @@ __global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
 }

 template <typename T>
-__host__ void host_addition_plaintext(cuda_stream_t *stream, T *output,
-                                      T *lwe_input, T *plaintext_input,
-                                      uint32_t lwe_dimension,
-                                      uint32_t lwe_ciphertext_count) {
+__host__ void
+host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
+                        T *lwe_input, T *plaintext_input,
+                        uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
   int num_blocks = 0, num_threads = 0;
   int num_entries = lwe_ciphertext_count;
   getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
   dim3 grid(num_blocks, 1, 1);
   dim3 thds(num_threads, 1, 1);

-  cuda_memcpy_async_gpu_to_gpu(
-      output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count, stream);
-  plaintext_addition<<<grid, thds, 0, stream->stream>>>(
+  cuda_memcpy_async_gpu_to_gpu(
+      output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count * sizeof(T),
+      stream, gpu_index); // copy size is in bytes, hence sizeof(T)
+  plaintext_addition<<<grid, thds, 0, stream>>>(
       output, lwe_input, plaintext_input, lwe_dimension, num_entries);
   check_cuda_error(cudaGetLastError());
  }
@@ -60,11 +62,12 @@ __global__ void addition(T *output, T *input_1, T *input_2,

 // Coefficient-wise addition
 template <typename T>
-__host__ void host_addition(cuda_stream_t *stream, T *output, T *input_1,
-                            T *input_2, uint32_t input_lwe_dimension,
+__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
+                            T *input_1, T *input_2,
+                            uint32_t input_lwe_dimension,
                            uint32_t input_lwe_ciphertext_count) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -75,8 +78,7 @@ __host__ void host_addition(cuda_stream_t *stream, T *output, T *input_1,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  addition<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
-                                              num_entries);
+  addition<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
  check_cuda_error(cudaGetLastError());
 }

@@ -94,11 +96,12 @@ __global__ void subtraction(T *output, T *input_1, T *input_2,

 // Coefficient-wise subtraction
 template <typename T>
-__host__ void host_subtraction(cuda_stream_t *stream, T *output, T *input_1,
-                               T *input_2, uint32_t input_lwe_dimension,
+__host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
+                               T *output, T *input_1, T *input_2,
+                               uint32_t input_lwe_dimension,
                               uint32_t input_lwe_ciphertext_count) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -109,8 +112,7 @@ __host__ void host_subtraction(cuda_stream_t *stream, T *output, T *input_1,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  subtraction<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
-                                                 num_entries);
+  subtraction<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
  check_cuda_error(cudaGetLastError());
 }

@@ -130,12 +132,13 @@ __global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
 }

 template <typename T>
-__host__ void host_subtraction_plaintext(cuda_stream_t *stream, T *output,
+__host__ void host_subtraction_plaintext(cudaStream_t stream,
+                                         uint32_t gpu_index, T *output,
                                         T *lwe_input, T *plaintext_input,
                                         uint32_t input_lwe_dimension,
                                         uint32_t input_lwe_ciphertext_count) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -145,9 +148,9 @@ __host__ void host_subtraction_plaintext(cuda_stream_t *stream, T *output,
  cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
                               input_lwe_ciphertext_count *
                                   (input_lwe_dimension + 1) * sizeof(T),
-                               stream);
+                               stream, gpu_index);

-  radix_body_subtraction_inplace<<<grid, thds, 0, stream->stream>>>(
+  radix_body_subtraction_inplace<<<grid, thds, 0, stream>>>(
      output, plaintext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
 }
@@ -175,11 +178,11 @@ __global__ void unchecked_sub_with_correcting_term(
 template <typename T>

 __host__ void host_unchecked_sub_with_correcting_term(
-    cuda_stream_t *stream, T *output, T *input_1, T *input_2,
+    cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t degree) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -190,7 +193,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  unchecked_sub_with_correcting_term<<<grid, thds, 0, stream->stream>>>(
+  unchecked_sub_with_correcting_term<<<grid, thds, 0, stream>>>(
      output, input_1, input_2, num_entries, lwe_size, message_modulus,
      carry_modulus, degree);
  check_cuda_error(cudaGetLastError());
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu
@@ -5,11 +5,12 @@
 * cleartext vector. See the equivalent operation on u64 data for more details.
 */
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_cleartext_multiplication(stream, static_cast<uint32_t *>(lwe_array_out),
+  host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
+                                static_cast<uint32_t *>(lwe_array_out),
                                static_cast<uint32_t *>(lwe_array_in),
                                static_cast<uint32_t *>(cleartext_array_in),
                                input_lwe_dimension,
@@ -44,11 +45,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
 * function that performs the operation on the GPU.
 */
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

-  host_cleartext_multiplication(stream, static_cast<uint64_t *>(lwe_array_out),
+  host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
+                                static_cast<uint64_t *>(lwe_array_out),
                                static_cast<uint64_t *>(lwe_array_in),
                                static_cast<uint64_t *>(cleartext_array_in),
                                input_lwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -29,11 +29,12 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,

 template <typename T>
 __host__ void
-host_cleartext_multiplication(cuda_stream_t *stream, T *output, T *lwe_input,
-                              T *cleartext_input, uint32_t input_lwe_dimension,
+host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
+                              T *output, T *lwe_input, T *cleartext_input,
+                              uint32_t input_lwe_dimension,
                              uint32_t input_lwe_ciphertext_count) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -44,7 +45,7 @@ host_cleartext_multiplication(cuda_stream_t *stream, T *output, T *lwe_input,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  cleartext_multiplication<<<grid, thds, 0, stream->stream>>>(
+  cleartext_multiplication<<<grid, thds, 0, stream>>>(
      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cu
@@ -4,13 +4,14 @@
 * Perform the negation of a u32 input LWE ciphertext vector.
 * See the equivalent operation on u64 ciphertexts for more details.
 */
-void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
+void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

-  host_negation(stream, static_cast<uint32_t *>(lwe_array_out),
+  host_negation(static_cast<cudaStream_t>(stream), gpu_index,
+                static_cast<uint32_t *>(lwe_array_out),
                static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
                input_lwe_ciphertext_count);
 }
@@ -37,13 +38,14 @@ void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
 * LWE ciphertext vector is left unchanged. This function is a wrapper to a
 * device function that performs the operation on the GPU.
 */
-void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
+void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

-  host_negation(stream, static_cast<uint64_t *>(lwe_array_out),
+  host_negation(static_cast<cudaStream_t>(stream), gpu_index,
+                static_cast<uint64_t *>(lwe_array_out),
                static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
                input_lwe_ciphertext_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
@@ -22,11 +22,11 @@ __global__ void negation(T *output, T *input, uint32_t num_entries) {
 }

 template <typename T>
-__host__ void host_negation(cuda_stream_t *stream, T *output, T *input,
-                            uint32_t input_lwe_dimension,
+__host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
+                            T *input, uint32_t input_lwe_dimension,
                            uint32_t input_lwe_ciphertext_count) {

-  cudaSetDevice(stream->gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -37,7 +37,7 @@ __host__ void host_negation(cuda_stream_t *stream, T *output, T *input,
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

-  negation<<<grid, thds, 0, stream->stream>>>(output, input, num_entries);
+  negation<<<grid, thds, 0, stream>>>(output, input, num_entries);
  check_cuda_error(cudaGetLastError());
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstraping_key.cu
@@ -1,36 +1,38 @@
 #include "bootstrapping_key.cuh"

 void cuda_convert_lwe_programmable_bootstrap_key_32(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size) {
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size) {
  uint32_t total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  cuda_convert_lwe_programmable_bootstrap_key<uint32_t, int32_t>(
-      (double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
-      level_count, polynomial_size, total_polynomials);
+      static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
+      (int32_t *)src, polynomial_size, total_polynomials);
 }

 void cuda_convert_lwe_programmable_bootstrap_key_64(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size) {
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size) {
  uint32_t total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  cuda_convert_lwe_programmable_bootstrap_key<uint64_t, int64_t>(
-      (double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
-      level_count, polynomial_size, total_polynomials);
+      static_cast<cudaStream_t>(stream), gpu_index, (double2 *)dest,
+      (int64_t *)src, polynomial_size, total_polynomials);
 }

 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
-    uint32_t grouping_factor) {
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size, uint32_t grouping_factor) {
  uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
                               level_count * (1 << grouping_factor) /
                               grouping_factor;
  size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);

  cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
-                           stream);
+                           static_cast<cudaStream_t>(stream), gpu_index);
 }

 // We need these lines so the compiler knows how to specialize these functions
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -60,12 +60,12 @@ __device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
 }
 ////////////////////////////////////////////////
 template <typename T, typename ST>
-void cuda_convert_lwe_programmable_bootstrap_key(
-    double2 *dest, ST *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
-    uint32_t total_polynomials) {
-
-  cudaSetDevice(stream->gpu_index);
+void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
+                                                 uint32_t gpu_index,
+                                                 double2 *dest, ST *src,
+                                                 uint32_t polynomial_size,
+                                                 uint32_t total_polynomials) {
+  cudaSetDevice(gpu_index);
  int shared_memory_size = sizeof(double) * polynomial_size;

  // Here the buffer size is the size of double2 times the number of polynomials
@@ -79,7 +79,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(

  double2 *h_bsk = (double2 *)malloc(buffer_size);

-  double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream);
+  double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);

  // compress real bsk to complex and divide it on DOUBLE_MAX
  for (int i = 0; i < total_polynomials; i++) {
@@ -96,12 +96,12 @@ void cuda_convert_lwe_programmable_bootstrap_key(
    }
  }

-  cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream);
+  cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);

-  double2 *buffer = (double2 *)cuda_malloc_async(0, stream);
+  double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
  switch (polynomial_size) {
  case 256:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -109,17 +109,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(
          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              d_bsk, dest, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
+                                                                buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
+          <<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 512:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -127,17 +127,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              d_bsk, dest, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
+                                                                buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
+          <<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 1024:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -145,17 +145,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              d_bsk, dest, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
+                                                                buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
+          <<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 2048:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -163,17 +163,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              d_bsk, dest, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
+                                                                buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
+          <<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 4096:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -181,17 +181,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              d_bsk, dest, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
+                                                                buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
+          <<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 8192:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -199,17 +199,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              d_bsk, dest, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
+                                                                buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
+          <<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 16384:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -217,13 +217,13 @@ void cuda_convert_lwe_programmable_bootstrap_key(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              d_bsk, dest, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
+                                                                buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
+          <<<gridSize, blockSize, 0, stream>>>(d_bsk, dest, buffer);
    }
    break;
  default:
@@ -231,16 +231,17 @@ void cuda_convert_lwe_programmable_bootstrap_key(
          "N's are powers of two in the interval [256..16384].")
  }

-  cuda_drop_async(d_bsk, stream);
-  cuda_drop_async(buffer, stream);
+  cuda_drop_async(d_bsk, stream, gpu_index);
+  cuda_drop_async(buffer, stream, gpu_index);
  free(h_bsk);
 }

-void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
-                                 cuda_stream_t *stream,
+void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
+                                 void *_input1, void *_input2, void *_output,
                                 uint32_t polynomial_size,
                                 uint32_t total_polynomials) {

+  cudaSetDevice(gpu_index);
  auto input1 = (double2 *)_input1;
  auto input2 = (double2 *)_input2;
  auto output = (double2 *)_output;
@@ -253,8 +254,8 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
  double2 *buffer;
  switch (polynomial_size) {
  case 256:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream);
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
@@ -264,19 +265,18 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              input1, input2, output, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
-                                                       buffer);
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 512:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream);
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
@@ -286,19 +286,18 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              input1, input2, output, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
-                                                       buffer);
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 1024:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream);
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
@@ -308,19 +307,18 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              input1, input2, output, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
-                                                       buffer);
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 2048:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream);
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
@@ -330,19 +328,18 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              input1, input2, output, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
-                                                       buffer);
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 4096:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream);
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
@@ -352,19 +349,18 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              input1, input2, output, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
-                                                       buffer);
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 8192:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream);
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
@@ -374,19 +370,18 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              input1, input2, output, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
-                                                       buffer);
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  case 16384:
-    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream);
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
@@ -397,20 +392,19 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                           FULLSM>
-          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
-              input1, input2, output, buffer);
+          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
+                                                                output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
-          shared_memory_size * total_polynomials, stream);
+          shared_memory_size * total_polynomials, stream, gpu_index);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
-          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
-                                                       buffer);
+          <<<gridSize, blockSize, 0, stream>>>(input1, input2, output, buffer);
    }
    break;
  default:
    break;
  }
-  cuda_drop_async(buffer, stream);
+  cuda_drop_async(buffer, stream, gpu_index);
 }

 #endif // CNCRT_BSK_H
--- a/Show More
+++ b/Show More