fix(gpu): fix naming in shift/rotate

chore(ci): use only pull_request event in workflow
Using the pull_request_target event to handle PRs from forks clashed with the pull_request event: it launched twice as many actions and, moreover, led to job cancellations because of the concurrency directive.
2026-01-11 15:48:20 -05:00 · 2024-06-03 09:59:21 +02:00 · 2024-05-30 15:29:28 +02:00 · 2024-05-30 11:16:18 +02:00 · 2024-05-30 11:16:18 +02:00 · 2024-05-30 09:24:25 +02:00
909 changed files with 124828 additions and 51834 deletions
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,6 +1,6 @@
 ---
 name: Bug report
-about: Report a problem with concrete
+about: Report a problem with TFHE-rs
 title: ''
 labels: triage_required
 assignees: ''
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -1,6 +1,6 @@
 ---
 name: Feature request
-about: Suggest an idea for concrete
+about: Suggest an idea for TFHE-rs
 title: ''
 labels: feature_request
 assignees: ''
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -0,0 +1,9 @@
+self-hosted-runner:
+  # Labels of self-hosted runner in array of strings.
+  labels:
+    - m1mac
+    - 4090-desktop
+# Configuration variables in array of strings defined in your repository or
+# organization. `null` means disabling configuration variables check.
+# Empty array means no configuration variable is allowed.
+config-variables: null
--- a/.github/workflows/approve_label.yml
+++ b/.github/workflows/approve_label.yml
@@ -0,0 +1,36 @@
+# Add labels in pull request
+name: PR label manager
+
+on:
+  pull_request:
+  pull_request_review:
+    types: [submitted]
+
+jobs:
+  trigger-tests:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Get current labels
+        uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
+
+      # Remove label if a push is performed after an approval
+      - name: Remove approved label
+        if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
+        uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
+        with:
+          # We use a PAT to have the same user (zama-bot) for label deletion as for creation.
+          github_token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          labels: approved
+
+      # Add label only if the review is approved and if the label doesn't already exist
+      - name: Add approved label
+        uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
+        if: ${{ github.event_name == 'pull_request_review' 
+          && github.event.review.state == 'approved'
+          && !contains(fromJSON(env.LABELS), 'approved') }}
+        with:
+          # We need to use a PAT to be able to trigger `labeled` event for the other workflow.
+          github_token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          labels: approved
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -5,71 +5,66 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-    # All the inputs are provided by Slab
-    inputs:
-      instance_id:
-        description: "AWS instance ID"
-        type: string
-      instance_image_id:
-        description: "AWS instance AMI ID"
-        type: string
-      instance_type:
-        description: "AWS instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: 'Slab request ID'
-        type: string
-      fork_repo:
-        description: 'Name of forked repo as user/repo'
-        type: string
-      fork_git_sha:
-        description: 'Git SHA to checkout from fork'
-        type: string
+  pull_request:

 jobs:
-  fast-tests:
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
-      cancel-in-progress: true
-    runs-on: ${{ inputs.runner_name }}
+  setup-instance:
+    name: Setup instance (fast-tests)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      # Step used for log purpose.
-      - name: Instance configuration used
-        run: |
-          echo "ID: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-          echo "Fork repo: ${{ inputs.fork_repo }}"
-          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
-
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
        with:
-          repository: ${{ inputs.fork_repo }}
-          ref: ${{ inputs.fork_git_sha }}
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+  fast-tests:
+    name: Fast CPU tests
+    needs: setup-instance
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable
-          default: true

      - name: Run concrete-csprng tests
        run: |
          make test_concrete_csprng

+      - name: Run tfhe-zk-pok tests
+        run: |
+          make test_zk_pok
+
      - name: Run core tests
        run: |
          AVX512_SUPPORT=ON make test_core_crypto
@@ -117,11 +112,31 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:
+    name: Teardown instance (fast-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, fast-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_gpu_4090_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_4090_tests.yml
@@ -0,0 +1,77 @@
+# Compile and test tfhe-cuda-backend on an RTX 4090 machine
+name: TFHE Cuda Backend - 4090 full tests
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]
+
+jobs:
+  cuda-tests-linux:
+    name: CUDA tests (RTX 4090)
+    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, '4090_test') }}
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ["self-hosted", "4090-desktop"]
+
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: stable
+
+      - name: Run fmt checks
+        run: |
+          make check_fmt_gpu
+
+      - name: Run clippy checks
+        run: |
+          make pcc_gpu
+
+      - name: Run core crypto, integer and internal CUDA backend tests
+        run: |
+          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+      - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
+        if: ${{ always() && github.event_name == 'pull_request' }}
+        with:
+          labels: 4090_test
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Slack Notification
+        if: ${{ always() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_tests.yml
@@ -1,45 +1,47 @@
-# Compile and test Concrete-cuda on an AWS instance
-name: Concrete Cuda - Full tests
+# Compile and test tfhe-cuda-backend on an AWS instance
+name: TFHE Cuda Backend - Full tests

 env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-    # All the inputs are provided by Slab
-    inputs:
-      instance_id:
-        description: "AWS instance ID"
-        type: string
-      instance_image_id:
-        description: "AWS instance AMI ID"
-        type: string
-      instance_type:
-        description: "AWS instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: 'Slab request ID'
-        type: string
-      fork_repo:
-        description: 'Name of forked repo as user/repo'
-        type: string
-      fork_git_sha:
-        description: 'Git SHA to checkout from fork'
-        type: string
+  pull_request:

 jobs:
-  run-cuda-tests-linux:
+  setup-instance:
+    name: Setup instance (cuda-tests)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: gpu-test
+
+  cuda-pcc:
+    name: CUDA post-commit checks
+    needs: setup-instance
    concurrency:
-      group: tfhe_cuda_backend_test-${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    name: Test code in EC2
-    runs-on: ${{ inputs.runner_name }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -52,31 +54,19 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}

    steps:
-      # Step used for log purpose.
-      - name: Instance configuration used
-        run: |
-          echo "ID: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-          echo "Fork repo: ${{ inputs.fork_repo }}"
-          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
-
      - name: Checkout tfhe-rs
-        uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
-          repository: ${{ inputs.fork_repo }}
-          ref: ${{ inputs.fork_git_sha }}
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable
-          default: true

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
@@ -90,15 +80,125 @@ jobs:
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Run fmt checks
+        run: |
+          make check_fmt_gpu

      - name: Run clippy checks
        run: |
-          make clippy_gpu
+          make pcc_gpu

-      - name: Run all tests
+      - name: Slack Notification
+        if: ${{ always() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  cuda-tests-linux:
+    name: CUDA tests
+    needs: [ setup-instance, cuda-pcc ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Run core crypto, integer and internal CUDA backend tests
        run: |
          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+      - name: Slack Notification
+        if: ${{ always() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-pcc, cuda-tests-linux ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -4,66 +4,59 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-    # All the inputs are provided by Slab
-    inputs:
-      instance_id:
-        description: "AWS instance ID"
-        type: string
-      instance_image_id:
-        description: "AWS instance AMI ID"
-        type: string
-      instance_type:
-        description: "AWS instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      fork_repo:
-        description: "Name of forked repo as user/repo"
-        type: string
-      fork_git_sha:
-        description: "Git SHA to checkout from fork"
-        type: string
+  pull_request:
+    types: [ labeled ]

 jobs:
-  integer-tests:
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
-      cancel-in-progress: true
-    runs-on: ${{ inputs.runner_name }}
+  setup-instance:
+    name: Setup instance (unsigned-integer-tests)
+    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      # Step used for log purpose.
-      - name: Instance configuration used
-        run: |
-          echo "ID: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-          echo "Fork repo: ${{ inputs.fork_repo }}"
-          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
-
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
        with:
-          repository: ${{ inputs.fork_repo }}
-          ref: ${{ inputs.fork_git_sha }}
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+  unsigned-integer-tests:
+    name: Unsigned integer tests
+    needs: setup-instance
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable
-          default: true

      - name: Gen Keys if required
        run: |
@@ -84,11 +77,31 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (unsigned-integer-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, unsigned-integer-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -4,66 +4,59 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-    # All the inputs are provided by Slab
-    inputs:
-      instance_id:
-        description: "AWS instance ID"
-        type: string
-      instance_image_id:
-        description: "AWS instance AMI ID"
-        type: string
-      instance_type:
-        description: "AWS instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      fork_repo:
-        description: "Name of forked repo as user/repo"
-        type: string
-      fork_git_sha:
-        description: "Git SHA to checkout from fork"
-        type: string
+  pull_request:
+    types: [ labeled ]

 jobs:
-  multi-bit-tests:
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
-      cancel-in-progress: true
-    runs-on: ${{ inputs.runner_name }}
+  setup-instance:
+    name: Setup instance (signed-integer-tests)
+    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      # Step used for log purpose.
-      - name: Instance configuration used
-        run: |
-          echo "ID: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-          echo "Fork repo: ${{ inputs.fork_repo }}"
-          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
-
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
        with:
-          repository: ${{ inputs.fork_repo }}
-          ref: ${{ inputs.fork_git_sha }}
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+  signed-integer-tests:
+    name: Signed integer tests
+    needs: setup-instance
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable
-          default: true

      - name: Gen Keys if required
        run: |
@@ -88,11 +81,31 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (signed-integer-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, signed-integer-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -4,105 +4,215 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-    # All the inputs are provided by Slab
-    inputs:
-      instance_id:
-        description: "AWS instance ID"
-        type: string
-      instance_image_id:
-        description: "AWS instance AMI ID"
-        type: string
-      instance_type:
-        description: "AWS instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: 'Slab request ID'
-        type: string
-      fork_repo:
-        description: 'Name of forked repo as user/repo'
-        type: string
-      fork_git_sha:
-        description: 'Git SHA to checkout from fork'
-        type: string
+  pull_request:
+    types: [ labeled ]
+  schedule:
+    # Nightly tests @ 1AM UTC, Monday through Friday
+    - cron: "0 1 * * MON-FRI"

 jobs:
-  shortint-tests:
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
-      cancel-in-progress: true
-    runs-on: ${{ inputs.runner_name }}
+  should-run:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    outputs:
+      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
+      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
+      core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.core_crypto_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      boolean_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.boolean_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      shortint_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.shortint_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      high_level_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.high_level_api_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      c_api_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.c_api_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      examples_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.examples_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      apps_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.apps_any_changed || steps.changed-files.outputs.dependencies_any_changed }}
+      user_docs_test: ${{ env.IS_PULL_REQUEST == 'false' ||
+        steps.changed-files.outputs.user_docs_any_changed ||
+        steps.changed-files.outputs.dependencies_any_changed }}
+      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
-      # Step used for log purpose.
-      - name: Instance configuration used
-        run: |
-          echo "ID: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-          echo "Fork repo: ${{ inputs.fork_repo }}"
-          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
-
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
        with:
-          repository: ${{ inputs.fork_repo }}
-          ref: ${{ inputs.fork_git_sha }}
+          fetch-depth: 0
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@03334d095e2739fa9ac4034ec16f66d5d01e9eba
+        with:
+          since_last_remote_commit: true
+          files_yaml: |
+            dependencies:
+              - tfhe/Cargo.toml
+              - concrete-csprng/**
+              - tfhe-zk-pok/**
+            csprng:
+              - concrete-csprng/**
+            zk_pok:
+              - tfhe-zk-pok/**
+            core_crypto:
+              - tfhe/src/core_crypto/**
+            boolean:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/boolean/**
+            shortint:
+              - tfhe/src/core_crypto/**
+              - tfhe/src/shortint/**
+            high_level_api:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+            c_api:
+              - tfhe/src/**
+            examples:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - tfhe/examples/**
+            apps:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - apps/trivium/src/**
+            user_docs:
+              - tfhe/src/**
+              - '!tfhe/src/c_api/**'
+              - 'tfhe/docs/**.md'
+              - README.md
+
+      - name: Aggregate file changes
+        id: aggregated-changes
+        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
+          steps.changed-files.outputs.csprng_any_changed == 'true' ||
+          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+          steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
+          steps.changed-files.outputs.boolean_any_changed == 'true' ||
+          steps.changed-files.outputs.shortint_any_changed == 'true' ||
+          steps.changed-files.outputs.high_level_api_any_changed == 'true' ||
+          steps.changed-files.outputs.c_api_any_changed == 'true' ||
+          steps.changed-files.outputs.examples_any_changed == 'true' ||
+          steps.changed-files.outputs.apps_any_changed == 'true' ||
+          steps.changed-files.outputs.user_docs_any_changed == 'true')
+        run: |
+          echo "any_changed=true" >> "$GITHUB_OUTPUT"
+
+  setup-instance:
+    name: Setup instance (cpu-tests)  # on pull requests, only the 'approved' label may start the paid instance
+    if: github.event_name != 'pull_request' ||
+      (contains(github.event.label.name, 'approved') && needs.should-run.outputs.any_file_changed == 'true')
+    needs: should-run  # provides the any_file_changed output used in the condition above
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # consumed by cpu-tests (runs-on) and teardown-instance (label)
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-big
+
+  cpu-tests:
+    name: CPU tests
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
+    needs: [ should-run, setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable
-          default: true

      - name: Run concrete-csprng tests
+        if: needs.should-run.outputs.csprng_test == 'true'
        run: |
          make test_concrete_csprng

+      - name: Run tfhe-zk-pok tests
+        if: needs.should-run.outputs.zk_pok_test == 'true'
+        run: |
+          make test_zk_pok
+
      - name: Run core tests
+        if: needs.should-run.outputs.core_crypto_test == 'true'
        run: |
          AVX512_SUPPORT=ON make test_core_crypto

      - name: Run boolean tests
+        if: needs.should-run.outputs.boolean_test == 'true'
        run: |
          make test_boolean

      - name: Run C API tests
+        if: needs.should-run.outputs.c_api_test == 'true'
        run: |
          make test_c_api

      - name: Run user docs tests
+        if: needs.should-run.outputs.user_docs_test == 'true'
        run: |
          make test_user_doc

      - name: Gen Keys if required
+        if: needs.should-run.outputs.shortint_test == 'true'
        run: |
          make gen_key_cache

      - name: Run shortint tests
+        if: needs.should-run.outputs.shortint_test == 'true'
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_shortint_ci

      - name: Run high-level API tests
+        if: needs.should-run.outputs.high_level_api_test == 'true'
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_high_level_api

      - name: Run example tests
+        if: needs.should-run.outputs.examples_test == 'true'
        run: |
          make test_examples
          make dark_market

      - name: Run apps tests
+        if: needs.should-run.outputs.apps_test == 'true'
        run: |
          make test_trivium
          make test_kreyvium
@@ -110,11 +220,31 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cpu-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}  # run even after failed/cancelled tests, unless setup-instance was skipped
+    needs: [ setup-instance, cpu-tests ]  # only starts once both jobs have finished
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # runner-name output of the setup-instance job
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # only when a previous step of this job failed
+        continue-on-error: true  # a notification error must not fail the teardown job
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -4,66 +4,67 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-    # All the inputs are provided by Slab
-    inputs:
-      instance_id:
-        description: "AWS instance ID"
-        type: string
-      instance_image_id:
-        description: "AWS instance AMI ID"
-        type: string
-      instance_type:
-        description: "AWS instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: 'Slab request ID'
-        type: string
-      fork_repo:
-        description: 'Name of forked repo as user/repo'
-        type: string
-      fork_git_sha:
-        description: 'Git SHA to checkout from fork'
-        type: string
+  pull_request:
+    types: [ labeled ]

 jobs:
-  wasm-tests:
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
-      cancel-in-progress: true
-    runs-on: ${{ inputs.runner_name }}
+  setup-instance:
+    name: Setup instance (wasm-tests)
+    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      # Step used for log purpose.
-      - name: Instance configuration used
-        run: |
-          echo "ID: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-          echo "Fork repo: ${{ inputs.fork_repo }}"
-          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
-
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
        with:
-          repository: ${{ inputs.fork_repo }}
-          ref: ${{ inputs.fork_git_sha }}
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+  wasm-tests:
+    name: WASM tests
+    needs: setup-instance
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable
-          default: true
+
+      - name: Install Node
+        run: |
+          make install_node
+
+      - name: Run fmt checks
+        run: |
+          make check_fmt_js

      - name: Run js on wasm API tests
        run: |
@@ -71,17 +72,36 @@ jobs:

      - name: Run parallel wasm tests
        run: |
-          make install_node
          make ci_test_web_js_api_parallel

      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+  teardown-instance:
+    name: Teardown instance (wasm-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}  # run even after failed/cancelled tests, unless setup-instance was skipped
+    needs: [ setup-instance, wasm-tests ]  # only starts once both jobs have finished
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}  # runner-name output of the setup-instance job
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # only when a previous step of this job failed
+        continue-on-error: true  # a notification error must not fail the teardown job
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/boolean_benchmark.yml
+++ b/.github/workflows/boolean_benchmark.yml
@@ -32,6 +32,8 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  run-boolean-benchmarks:
@@ -51,7 +53,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -61,14 +63,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON bench_boolean
+          make bench_boolean

      - name: Parse results
        run: |
@@ -96,17 +97,17 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -125,11 +126,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Boolean benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Boolean benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -6,6 +6,8 @@ on:
 env:
  CARGO_TERM_COLOR: always
  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
@@ -17,11 +19,11 @@ jobs:

    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ubuntu-latest, macos-latest-large, windows-latest]
      fail-fast: false

    steps:
-      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29

      - name: Install and run newline linter checks
        if: matrix.os == 'ubuntu-latest'
@@ -66,5 +68,9 @@ jobs:
        run: |
          make build_c_api

+      - name: Build coverage tests
+        run: |
+          make build_tfhe_coverage
+
      # The wasm build check is a bit annoying to set-up here and is done during the tests in
      # aws_tfhe_tests.yml
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -0,0 +1,27 @@
+# Lint and check CI
+name: CI Lint and Checks
+
+on:
+  pull_request:
+
+env:
+  ACTIONLINT_VERSION: 1.6.27
+
+jobs:
+  lint-check:  # lints the repository's workflow files on every pull request
+    name: Lint and checks
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+
+      - name: Get actionlint  # downloads actionlint at ACTIONLINT_VERSION, verifies its SHA-256, then symlinks it into /usr/local/bin
+        run: |
+          bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) ${{ env.ACTIONLINT_VERSION }}
+          echo "f2ee6d561ce00fa93aab62a7791c1a0396ec7e8876b2a8f2057475816c550782  actionlint" > checksum
+          sha256sum -c checksum
+          ln -s "$(pwd)/actionlint" /usr/local/bin/
+
+      - name: Lint workflows  # NOTE(review): lint_workflow is a Makefile target defined outside this file; presumably it invokes actionlint
+        run: |
+          make lint_workflow
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -4,6 +4,8 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -38,7 +40,7 @@ jobs:
      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
    runs-on: ${{ inputs.runner_name }}
-    timeout-minutes: 1080
+    timeout-minutes: 11520 # 8 days
    steps:
      # Step used for log purpose.
      - name: Instance configuration used
@@ -51,7 +53,7 @@ jobs:
          echo "Fork git sha: ${{ inputs.fork_git_sha }}"

      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: ${{ inputs.fork_repo }}
          ref: ${{ inputs.fork_git_sha }}
@@ -61,14 +63,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable
-          default: true

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@62f4729b5df35e6e0e01265fa70a82ccaf196b4b
+        uses: tj-actions/changed-files@03334d095e2739fa9ac4034ec16f66d5d01e9eba
        with:
          files_yaml: |
            tfhe:
@@ -98,7 +99,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
+        uses: codecov/codecov-action@125fc84a9a348dbcf27191600683ec096ec9021c
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -106,10 +107,24 @@ jobs:
          fail_ci_if_error: true
          files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml

+      - name: Run integer coverage
+        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
+        run: |
+          make test_integer_cov
+
+      - name: Upload tfhe coverage to Codecov
+        uses: codecov/codecov-action@125fc84a9a348dbcf27191600683ec096ec9021c
+        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          directory: ./coverage/
+          fail_ci_if_error: true
+          files: integer/cobertura.xml
+
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/core_crypto_benchmark.yml
+++ b/.github/workflows/core_crypto_benchmark.yml
@@ -1,5 +1,5 @@
-# Run PBS benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: PBS benchmarks
+# Run core crypto benchmarks on an AWS instance and return parsed results to Slab CI bot.
+name: Core crypto benchmarks

 on:
  workflow_dispatch:
@@ -32,10 +32,12 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
-  run-pbs-benchmarks:
-    name: Execute PBS benchmarks in EC2
+  run-core-crypto-benchmarks:
+    name: Execute core crypto benchmarks in EC2
    runs-on: ${{ github.event.inputs.runner_name }}
    if: ${{ !cancelled() }}
    steps:
@@ -51,7 +53,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -61,14 +63,14 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON bench_pbs
+          make bench_pbs
+          make bench_ks

      - name: Parse results
        run: |
@@ -86,17 +88,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
-          name: ${{ github.sha }}_pbs
+          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -115,11 +117,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "PBS benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Core crypto benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"  # job runs both bench_pbs and bench_ks
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/core_crypto_gpu_benchmark.yml
+++ b/.github/workflows/core_crypto_gpu_benchmark.yml
@@ -0,0 +1,200 @@
+# Run core crypto benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: Core crypto GPU benchmarks
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1a.m.
+    - cron: '0 1 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  setup-instance:  # Starts the runner instance; scheduled runs only on zama-ai/tfhe-rs, manual dispatch always.
+    name: Setup instance (cuda-core-crypto-benchmarks)
+    runs-on: ubuntu-latest
+    if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}  # consumed by later jobs through runs-on / label
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+          profile: single-h100
+
+  cuda-core-crypto-benchmarks:
+    name: Execute GPU core crypto benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install ca-certificates curl
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo \
+          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+          $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: nightly
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make bench_pbs_gpu
+          make bench_ks_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "n3-H100x1" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --name-suffix avx512 \
+          --walk-subdirs \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        with:
+          name: ${{ github.sha }}_core_crypto
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on downloaded artifact"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-core-crypto-benchmarks ]
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-core-crypto-benchmarks.result }}
+          SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ needs.cuda-core-crypto-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-core-crypto-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-core-crypto-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/csprng_randomness_testing.yml
+++ b/.github/workflows/csprng_randomness_testing.yml
@@ -1,74 +0,0 @@
-name: CSPRNG randomness testing Workflow
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-    # All the inputs are provided by Slab
-    inputs:
-      instance_id:
-        description: "AWS instance ID"
-        type: string
-      instance_image_id:
-        description: "AWS instance AMI ID"
-        type: string
-      instance_type:
-        description: "AWS instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: 'Slab request ID'
-        type: string
-      fork_repo:
-        description: 'Name of forked repo as user/repo'
-        type: string
-      fork_git_sha:
-        description: 'Git SHA to checkout from fork'
-        type: string
-
-jobs:
-  csprng-randomness-teting:
-    name: CSPRNG randomness testing
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
-      cancel-in-progress: true
-    runs-on: ${{ inputs.runner_name }}
-
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          repository: ${{ inputs.fork_repo }}
-          ref: ${{ inputs.fork_git_sha }}
-
-      - name: Set up home
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install latest stable
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
-        with:
-          toolchain: stable
-          default: true
-
-      - name: Dieharder randomness test suite
-        run: |
-          make dieharder_csprng
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -0,0 +1,95 @@
+name: CSPRNG randomness testing Workflow
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+    types: [ labeled ]
+
+jobs:
+  setup-instance:
+    name: Setup instance (csprng-randomness-tests)
+    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: cpu-small
+
+  csprng-randomness-tests:
+    name: CSPRNG randomness tests
+    needs: setup-instance
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: true
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: stable
+
+      - name: Dieharder randomness test suite
+        run: |
+          make dieharder_csprng
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (csprng-randomness-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, csprng-randomness-tests ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_4090_full_benchmark.yml
+++ b/.github/workflows/gpu_4090_full_benchmark.yml
@@ -0,0 +1,202 @@
+# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
+name: TFHE Cuda Backend - 4090 full benchmarks
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+    types: [labeled]
+  schedule:
+    # Weekly benchmarks will be triggered each Friday at 9p.m.
+    - cron: "0 21 * * 5"
+
+jobs:
+  cuda-integer-benchmarks:
+    name: Cuda integer benchmarks for all operations flavor  (RTX 4090)
+    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
+      cancel-in-progress: true
+    runs-on: ["self-hosted", "4090-desktop"]
+    timeout-minutes: 1440 # 24 hours
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        command: [integer, integer_multi_bit]
+        op_flavor: [default, unchecked]
+
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: nightly
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run integer benchmarks
+        run: |
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "rtx4090" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        with:
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ always() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  cuda-core-crypto-benchmarks:
+    name: Cuda core crypto benchmarks  (RTX 4090)
+    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
+    needs: cuda-integer-benchmarks
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
+      cancel-in-progress: true
+    runs-on: ["self-hosted", "4090-desktop"]
+    timeout-minutes: 1440 # 24 hours
+
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: nightly
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Run core crypto benchmarks
+        run: |
+          make bench_pbs_gpu
+          make bench_ks_gpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware "rtx4090" \
+          --backend gpu \
+          --project-version "${{ env.COMMIT_HASH }}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${{ env.COMMIT_DATE }}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --walk-subdirs \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
+        with:
+          name: ${{ github.sha }}_core_crypto
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on results file"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ !success() && !cancelled() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  remove_github_label:
+    name: Remove 4090 bench label
+    if: ${{ always() && github.event_name == 'pull_request' }}
+    needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
+        with:
+          labels: 4090_bench
+          github_token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/hyperstack_tfhe_gpu_tests.yml
+++ b/.github/workflows/hyperstack_tfhe_gpu_tests.yml
@@ -0,0 +1,160 @@
+# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
+name: TFHE Cuda Backend - Full tests on H100
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+  pull_request:
+
+jobs:
+  setup-instance:
+    name: Setup instance (cuda-h100-tests)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-tests-linux:
+    name: CUDA H100 tests
+    needs: [ setup-instance ]
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
+    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install ca-certificates curl
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo \
+          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+           $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Run core crypto, integer and internal CUDA backend tests
+        run: |
+          make test_gpu
+
+      - name: Run user docs tests
+        run: |
+          make test_user_doc_gpu
+
+      - name: Test C API
+        run: |
+          make test_c_api_gpu
+
+      - name: Run High Level API Tests
+        run: |
+          make test_high_level_api_gpu
+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-tests-linux ]
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
+          SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:  # Stops the instance started by setup-instance, whatever the outcome of the other jobs.
+    name: Teardown instance (cuda-h100-tests)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-tests-linux, slack-notify ]  # slack-notify runs on the instance: wait for it before stopping
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}  # only report when the stop step itself failed
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_benchmark.yml
+++ b/.github/workflows/integer_benchmark.yml
@@ -25,6 +25,8 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  run-integer-benchmarks:
@@ -44,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -54,14 +56,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer
+          make FAST_BENCH=TRUE bench_integer

      - name: Parse benchmarks to csv
        run: |
@@ -69,7 +70,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,17 +91,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -119,11 +120,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_full_benchmark.yml
+++ b/.github/workflows/integer_full_benchmark.yml
@@ -28,6 +28,8 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  prepare-matrix:
@@ -39,17 +41,17 @@ jobs:
      - name: Weekly benchmarks
        if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
        run: |
-          echo "OP_FLAVOR=[\"default\"]" >> ${GITHUB_ENV}
+          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"

      - name: Quarterly benchmarks
        if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
        run: |
-          echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> ${GITHUB_ENV}
+          echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"

      -  name: Set operation flavor output
         id: set_op_flavor
         run: |
-          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> ${GITHUB_OUTPUT}
+          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

  integer-benchmarks:
    name: Execute integer benchmarks for all operations flavor
@@ -72,15 +74,17 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

      - name: Get benchmark details
        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
-          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"

      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
@@ -88,21 +92,20 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}

      - name: Parse results
        run: |
@@ -118,7 +121,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -145,11 +148,11 @@ jobs:
    steps:
      - name: Notify
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_gpu_benchmark.yml
+++ b/.github/workflows/integer_gpu_benchmark.yml
@@ -1,36 +1,47 @@
-# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
+# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
 name: Integer GPU benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
+  push:
+    branches:
+      - main

 env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  run-integer-benchmarks:
-    name: Execute integer benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+  setup-instance:
+    name: Setup instance (cuda-integer-benchmarks)
+    runs-on: ubuntu-latest
+    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-integer-benchmarks:
+    name: Execute GPU integer benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -38,57 +49,78 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
    steps:
-      - name: Instance configuration used
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+          sudo apt update
+          sudo apt install -y ca-certificates curl  # -y: nobody can answer a confirmation prompt on a CI runner
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo \
+          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+          $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
+          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu

      - name: Parse benchmarks to csv
        run: |
@@ -96,39 +128,37 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}

      - name: Parse results
        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
+          --hardware "n3-H100x1" \
          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
+          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -144,14 +174,39 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-benchmarks ]
+    runs-on: ubuntu-latest  # hosted runner: the notification must still go out when the instance never started
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-benchmarks.result }}
+          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ needs.cuda-integer-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_gpu_full_benchmark.yml
@@ -1,67 +1,96 @@
-# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
+# Run all integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
 name: Integer GPU full benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1 a.m. UTC.
+    - cron: '0 1 * * 6'

 env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  integer-benchmarks:
-    name: Execute integer benchmarks for all operations flavor
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+  setup-instance:  # scheduled runs are restricted to the upstream repository; manual dispatch is always allowed
+    name: Setup instance (cuda-integer-full-benchmarks)
+    runs-on: ubuntu-latest
+    if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-integer-full-benchmarks:
+    name: Execute GPU integer benchmarks for all operations flavor
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 1440 # 24 hours
    continue-on-error: true
    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
-        command: [ integer, integer_multi_bit]
-        op_flavor: [ default, unchecked ]
+        command: [integer, integer_multi_bit]
+        op_flavor: [default, unchecked]
        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 9
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
    steps:
-      - name: Instance configuration used
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
+          sudo apt update
+          sudo apt install -y ca-certificates curl  # -y: nobody can answer a confirmation prompt on a CI runner
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo \
+          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+          $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

      - name: Get benchmark details
        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
-          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"

      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
@@ -69,44 +98,46 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"

      - name: Checkout Slab repo
-        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu

      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
+          --hardware "n3-H100x1" \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
@@ -117,7 +148,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -136,19 +167,39 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

-  slack-notification:
+  slack-notify:
    name: Slack Notification
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ failure() }}
-    needs: integer-benchmarks
+    needs: [ setup-instance, cuda-integer-full-benchmarks ]
+    runs-on: ubuntu-latest  # hosted runner: the notification must still go out when the instance never started
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
    steps:
-      - name: Notify
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-full-benchmarks.result }}
+          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ needs.cuda-integer-full-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-full-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-full-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer GPU full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_multi_bit_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_benchmark.yml
@@ -25,6 +25,8 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  run-integer-benchmarks:
@@ -44,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -54,14 +56,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Run multi-bit benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer_multi_bit
+          make FAST_BENCH=TRUE bench_integer_multi_bit

      - name: Parse benchmarks to csv
        run: |
@@ -69,7 +70,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,17 +91,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -119,11 +120,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_multi_bit_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_gpu_benchmark.yml
@@ -1,95 +1,127 @@
-# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
-name: Integer Multi-bit benchmarks
+# Run integer benchmarks with multi-bit cryptographic parameters on an instance and return parsed results to Slab CI bot.
+name: Integer GPU Multi-bit benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 1 a.m. UTC.
+    - cron: '0 1 * * 6'

 env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 jobs:
-  run-integer-benchmarks:
-    name: Execute integer multi-bit benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
+  setup-instance:
+    name: Setup instance (cuda-integer-multi-bit-benchmarks)
+    runs-on: ubuntu-latest
+    if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: hyperstack
+          profile: single-h100
+
+  cuda-integer-multi-bit-benchmarks:
+    name: Execute GPU integer multi-bit benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 1440 # 24 hours
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "11.8"
-            cuda_arch: "70"
-            gcc: 9
+            cuda: "12.2"
+            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.1
    steps:
-      - name: Instance configuration used
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+          sudo apt update
+          sudo apt install ca-certificates curl
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo \
+          "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+          $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+          sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"

      - name: Run multi-bit benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
+          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu

      - name: Parse benchmarks to csv
        run: |
@@ -97,39 +129,37 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}

      - name: Parse results
        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
+          --hardware "n3-H100x1" \
          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
+          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
+          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512 \
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -145,14 +175,40 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

+
+  slack-notify:
+    name: Slack Notification
+    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    if: ${{ !success() && !cancelled() }}
+    continue-on-error: true
+    steps:
+      - name: Send message
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}
+          SLACK_MESSAGE: "Integer GPU multi-bit benchmarks finished with status: ${{ needs.cuda-integer-multi-bit-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (cuda-integer-multi-bit-benchmarks)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, cuda-integer-multi-bit-benchmarks, slack-notify ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          SLACK_MESSAGE: "Instance teardown (cuda-integer-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -3,7 +3,7 @@ name: Tests on M1 CPU
 on:
  workflow_dispatch:
  pull_request:
-    types: [labeled]
+    types: [ labeled ]
  # Have a nightly build for M1 tests
  schedule:
    # * is a special character in YAML so you have to quote this string
@@ -14,6 +14,8 @@ on:
 env:
  CARGO_TERM_COLOR: always
  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  FAST_TESTS: "TRUE"

@@ -25,15 +27,18 @@ jobs:
  cargo-builds:
    if: ${{ (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
    runs-on: ["self-hosted", "m1mac"]
+    # 12 hours, default is 6 hours, hopefully this is more than enough
+    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          persist-credentials: 'false'

      - name: Install latest stable
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: stable
-          default: true

      - name: Run pcc checks
        run: |
@@ -71,6 +76,10 @@ jobs:
        run: |
          make test_concrete_csprng

+      - name: Run tfhe-zk-pok tests
+        run: |
+          make test_zk_pok
+
      - name: Run core tests
        run: |
          make test_core_crypto
@@ -79,6 +88,13 @@ jobs:
        run: |
          make test_boolean

+      # Because we do "illegal" things with the build system which Cargo does not seem to like much
+      # we need to clear the cache to make sure the C API is built properly and does not use a stale
+      # cached version
+      - name: Clear build cache
+        run: |
+          cargo clean
+
      - name: Run C API tests
        run: |
          make test_c_api
@@ -130,7 +146,7 @@ jobs:
      - name: Slack Notification
        if: ${{ needs.cargo-builds.result != 'skipped' }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ needs.cargo-builds.result }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -30,7 +30,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -49,7 +49,7 @@ jobs:

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
-        uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
+        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
        with:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
@@ -65,7 +65,7 @@ jobs:

      - name: Publish Node package
        if: ${{ inputs.push_node_package }}
-        uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
+        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
        with:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
@@ -74,7 +74,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -32,7 +32,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -0,0 +1,129 @@
+# Publish new release of tfhe-cuda-backend on crates.io.
+name: Publish CUDA release
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+      push_to_crates:
+        description: "Push to crate"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+jobs:
+  setup-instance:
+    name: Setup instance (publish-cuda-release)
+    runs-on: ubuntu-latest
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: gpu-test
+
+  publish-cuda-release:
+    name: Publish CUDA Release
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    strategy:
+      fail-fast: false
+      # explicit include-based build matrix, of known valid options
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "12.2"
+            gcc: 9
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
+        with:
+          toolchain: stable
+
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Publish crate.io package
+        if: ${{ inputs.push_to_crates }}
+        env:
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+        run: |
+          cargo publish -p tfhe-cuda-backend --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (publish-cuda-release)
+    if: ${{ always() && needs.setup-instance.result != 'skipped' }}
+    needs: [ setup-instance, publish-cuda-release ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -0,0 +1,42 @@
+# Publish new release of tfhe-zk-pok on crates.io.
+name: Publish tfhe-zk-pok release
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+jobs:
+  publish_release:
+    name: Publish tfhe-zk-pok Release
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+        with:
+          fetch-depth: 0
+
+      - name: Publish crate.io package
+        env:
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+        run: |
+          cargo publish -p tfhe-zk-pok --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "tfhe-zk-pok release failed: (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -17,13 +17,14 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29

      - name: Checkout lattice-estimator
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
+          ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'

      - name: Install Sage
        run: |
@@ -41,7 +42,7 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/shortint_benchmark.yml
+++ b/.github/workflows/shortint_benchmark.yml
@@ -24,6 +24,8 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  run-shortint-benchmarks:
@@ -43,7 +45,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -53,14 +55,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON bench_shortint
+          make bench_shortint

      - name: Parse results
        run: |
@@ -88,17 +89,17 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_shortint
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -117,11 +118,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Shortint benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Shortint benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/shortint_full_benchmark.yml
+++ b/.github/workflows/shortint_full_benchmark.yml
@@ -32,6 +32,8 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  shortint-benchmarks:
@@ -51,15 +53,17 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

      - name: Get benchmark details
        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
-          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"

      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
@@ -67,21 +71,20 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint

      - name: Parse results
        run: |
@@ -112,7 +115,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -139,11 +142,11 @@ jobs:
    steps:
      - name: Notify
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Shortint full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_benchmark.yml
+++ b/.github/workflows/signed_integer_benchmark.yml
@@ -25,6 +25,8 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  run-integer-benchmarks:
@@ -44,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -54,14 +56,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_signed_integer
+          make FAST_BENCH=TRUE bench_signed_integer

      - name: Parse benchmarks to csv
        run: |
@@ -69,7 +70,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,17 +91,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -119,11 +120,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_full_benchmark.yml
+++ b/.github/workflows/signed_integer_full_benchmark.yml
@@ -28,6 +28,8 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  integer-benchmarks:
@@ -50,15 +52,17 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

      - name: Get benchmark details
        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
-          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"

      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
@@ -66,21 +70,20 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
+          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}

      - name: Parse results
        run: |
@@ -96,7 +99,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -123,11 +126,11 @@ jobs:
    steps:
      - name: Notify
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_multi_bit_benchmark.yml
+++ b/.github/workflows/signed_integer_multi_bit_benchmark.yml
@@ -25,6 +25,8 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  run-integer-benchmarks:
@@ -44,7 +46,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -54,14 +56,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Run multi-bit benchmarks with AVX512
        run: |
-          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_signed_integer_multi_bit
+          make FAST_BENCH=TRUE bench_signed_integer_multi_bit

      - name: Parse benchmarks to csv
        run: |
@@ -69,7 +70,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,17 +91,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -119,11 +120,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/start_benchmarks.yml
+++ b/.github/workflows/start_benchmarks.yml
@@ -32,8 +32,8 @@ on:
        description: "Run signed integer multi bit benches"
        type: boolean
        default: true
-      pbs_bench:
-        description: "Run PBS benches"
+      core_crypto_bench:
+        description: "Run core crypto benches"
        type: boolean
        default: true
      wasm_client_bench:
@@ -49,18 +49,17 @@ jobs:
        command: [ boolean_bench, shortint_bench,
                   integer_bench, integer_multi_bit_bench,
                   signed_integer_bench, signed_integer_multi_bit_bench,
-                   integer_gpu_bench, integer_multi_bit_gpu_bench,
-                   pbs_bench, wasm_client_bench ]
+                   core_crypto_bench, wasm_client_bench ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@62f4729b5df35e6e0e01265fa70a82ccaf196b4b
+        uses: tj-actions/changed-files@03334d095e2739fa9ac4034ec16f66d5d01e9eba
        with:
          files_yaml: |
            common_benches:
@@ -98,20 +97,20 @@ jobs:
              - tfhe/src/integer/**
              - tfhe/benches/integer/signed_bench.rs
              - .github/workflows/signed_integer_multi_bit_benchmark.yml
-            pbs_bench:
+            core_crypto_bench:
              - tfhe/src/core_crypto/**
              - tfhe/benches/core_crypto/**
-              - .github/workflows/pbs_benchmark.yml
+              - .github/workflows/core_crypto_benchmark.yml
            wasm_client_bench:
              - tfhe/web_wasm_parallel_tests/**
              - .github/workflows/wasm_client_benchmark.yml

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Start AWS job in Slab
        # If manually triggered check that the current bench has been requested
--- a/.github/workflows/start_full_benchmarks.yml
+++ b/.github/workflows/start_full_benchmarks.yml
@@ -24,21 +24,22 @@ jobs:
    if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
    strategy:
      matrix:
-        command: [ boolean_bench, shortint_full_bench, integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
-                   pbs_bench, wasm_client_bench ]
+        command: [ boolean_bench, shortint_full_bench,
+                   integer_full_bench, signed_integer_full_bench,
+                   core_crypto_bench, wasm_client_bench ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set benchmarks type as weekly
        if: (github.event_name == 'workflow_dispatch' && inputs.benchmark_type == 'weekly') || github.event.schedule == '0 1 * * 6'
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -13,25 +13,20 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0
-      - name: Save repo
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
-        with:
-          name: repo-archive
-          path: '.'
      - name: git-sync
        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
        with:
          source_repo: "zama-ai/tfhe-rs"
          source_branch: "main"
-          destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.CONCRETE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
+          destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.FHE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
          destination_branch: "main"
      - name: git-sync tags
        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
        with:
          source_repo: "zama-ai/tfhe-rs"
          source_branch: "refs/tags/*"
-          destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.CONCRETE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
+          destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.FHE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
          destination_branch: "refs/tags/*"
--- a/.github/workflows/trigger_aws_tests_on_pr.yml
+++ b/.github/workflows/trigger_aws_tests_on_pr.yml
@@ -1,55 +0,0 @@
-# Trigger an AWS build each time commits are pushed to a pull request.
-name: PR AWS build trigger
-
-on:
-  pull_request:
-  pull_request_review:
-    types: [submitted]
-
-jobs:
-  trigger-tests:
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: write
-    steps:
-      - name: Get current labels
-        uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
-
-      - name: Remove approved label
-        if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
-        uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
-        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-          labels: approved
-
-      - name: Launch fast tests
-        if: ${{ github.event_name == 'pull_request' }}
-        uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
-        with:
-          allow-repeats: true
-          message: |
-            @slab-ci cpu_fast_test
-            @slab-ci gpu_test
-
-      - name: Add approved label
-        uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
-        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
-        with:
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-          labels: approved
-
-      # PR label 'approved' presence is checked to avoid running the full test suite several times
-      # in case of multiple approvals without new commits in between.
-      - name: Launch full tests suite
-        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
-        uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
-        with:
-          allow-repeats: true
-          message: |
-            Pull Request has been approved :tada:
-            Launching full test suite...
-            @slab-ci cpu_test
-            @slab-ci cpu_unsigned_integer_test
-            @slab-ci cpu_signed_integer_test
-            @slab-ci cpu_wasm_test
-            @slab-ci csprng_randomness_testing
--- a/.github/workflows/wasm_client_benchmark.yml
+++ b/.github/workflows/wasm_client_benchmark.yml
@@ -32,6 +32,8 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"

 jobs:
  run-wasm-client-benchmarks:
@@ -51,7 +53,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          fetch-depth: 0

@@ -61,10 +63,9 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        uses: dtolnay/rust-toolchain@d8352f6b1d2e870bc5716e7a6d9b65c4cc244a1a
        with:
          toolchain: nightly
-          override: true

      - name: Run benchmarks
        run: |
@@ -97,17 +98,17 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
        with:
          name: ${{ github.sha }}_wasm
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -126,11 +127,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "WASM benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "WASM benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,6 @@ dieharder_run.log

 # Coverage reports
 /coverage/
+
+# Cuda local build
+backends/tfhe-cuda-backend/cuda/cmake-build-debug/
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,13 @@
 [workspace]
 resolver = "2"
-members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng"]
+members = [
+    "tfhe",
+    "tfhe-zk-pok",
+    "tasks",
+    "apps/trivium",
+    "concrete-csprng",
+    "backends/tfhe-cuda-backend",
+]

 [profile.bench]
 lto = "fat"
@@ -17,3 +24,4 @@ lto = "off"
 inherits = "dev"
 opt-level = 3
 lto = "off"
+debug-assertions = false
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@ OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
 TARGET_ARCH_FEATURE:=$(shell ./scripts/get_arch_feature.sh)
+CPU_COUNT=$(shell ./scripts/cpu_count.sh)
 RS_BUILD_TOOLCHAIN:=stable
 CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
 CARGO_PROFILE?=release
@@ -17,6 +18,7 @@ FAST_TESTS?=FALSE
 FAST_BENCH?=FALSE
 BENCH_OP_FLAVOR?=DEFAULT
 NODE_VERSION=20
+FORWARD_COMPAT?=OFF
 # sed: -n, do not print input stream, -e means a script/expression
 # 1,/version/ indicates from the first line, to the line matching version at the start of the line
 # p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
@@ -49,12 +51,18 @@ else
 		COVERAGE_ONLY=
 endif

+ifeq ($(FORWARD_COMPAT),ON)
+		FORWARD_COMPAT_FEATURE=forward_compatibility
+else
+		FORWARD_COMPAT_FEATURE=
+endif
+
 # Variables used only for regex_engine example
 REGEX_STRING?=''
 REGEX_PATTERN?=''

 # tfhe-cuda-backend
-TFHECUDA_SRC="backends/tfhe-cuda-backend/implementation"
+TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
 TFHECUDA_BUILD=$(TFHECUDA_SRC)/build

 # Exclude these files from coverage reports
@@ -112,7 +120,12 @@ install_wasm_pack: install_rs_build_toolchain

 .PHONY: install_node # Install last version of NodeJS via nvm
 install_node:
-	curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh | $(SHELL)
+	curl -o nvm_install.sh https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh
+	@echo "2ed5e94ba12434370f0358800deb69f514e8bce90f13beb0e1b241d42c6abafd nvm_install.sh" > nvm_checksum
+	@sha256sum -c nvm_checksum
+	@rm nvm_checksum
+	$(SHELL) nvm_install.sh
+	@rm nvm_install.sh
 	source ~/.bashrc
 	$(SHELL) -i -c 'nvm install $(NODE_VERSION)' || \
 	( echo "Unable to install node, unknown error." && exit 1 )
@@ -137,24 +150,68 @@ check_linelint_installed:
 	@printf "\n" | linelint - > /dev/null 2>&1 || \
 	( echo "Unable to locate linelint. Try installing it: https://github.com/fernandrone/linelint/releases" && exit 1 )

+.PHONY: check_actionlint_installed # Check if actionlint workflow linter is installed
+check_actionlint_installed:
+	@actionlint --version > /dev/null 2>&1 || \
+	( echo "Unable to locate actionlint. Try installing it: https://github.com/rhysd/actionlint/releases" && exit 1 )
+
+.PHONY: check_nvm_installed # Check if Node Version Manager is installed
+check_nvm_installed:
+	@source ~/.nvm/nvm.sh && nvm --version > /dev/null 2>&1 || \
+	( echo "Unable to locate Node. Run 'make install_node'" && exit 1 )
+
+.PHONY: install_mlc # Install mlc (Markup Link Checker)
+install_mlc: install_rs_build_toolchain
+	@mlc --version > /dev/null 2>&1 || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install mlc --locked || \
+	( echo "Unable to install mlc, unknown error." && exit 1 )
+
 .PHONY: fmt # Format rust code
 fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt

+.PHONY: fmt_js # Format javascript code
+fmt_js: check_nvm_installed
+	source ~/.nvm/nvm.sh && \
+	nvm install $(NODE_VERSION) && \
+	nvm use $(NODE_VERSION) && \
+	$(MAKE) -C tfhe/web_wasm_parallel_tests fmt
+
 .PHONY: fmt_gpu # Format rust and cuda code
 fmt_gpu: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
 	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh

+.PHONY: fmt_c_tests # Format c tests
+fmt_c_tests:
+	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format -style=file -i {} \;
+
 .PHONY: check_fmt # Check rust code format
 check_fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check

-.PHONY: clippy_gpu # Run clippy lints on the gpu backend
+.PHONY: check_fmt_c_tests  # Check C tests format
+check_fmt_c_tests:
+	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format --dry-run --Werror -style=file {} \;
+
+.PHONY: check_fmt_gpu # Check rust and cuda code format
+check_fmt_gpu: install_rs_check_toolchain
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
+	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
+
+.PHONY: check_fmt_js # Check javascript code format
+check_fmt_js: check_nvm_installed
+	source ~/.nvm/nvm.sh && \
+	nvm install $(NODE_VERSION) && \
+	nvm use $(NODE_VERSION) && \
+	$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt
+
+.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
-		-p tfhe -- --no-deps -D warnings
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
+		--all-targets \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
 fix_newline: check_linelint_installed
@@ -164,6 +221,10 @@ fix_newline: check_linelint_installed
 check_newline: check_linelint_installed
 	linelint .

+.PHONY: lint_workflow # Run static linter on GitHub workflows
+lint_workflow: check_actionlint_installed
+	actionlint
+
 .PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
 clippy_core: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -172,6 +233,12 @@ clippy_core: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=$(TARGET_ARCH_FEATURE),nightly-avx512 \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_boolean # Run clippy lints enabling the boolean features
 clippy_boolean: install_rs_check_toolchain
@@ -200,7 +267,7 @@ clippy: install_rs_check_toolchain
 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
 clippy_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
@@ -210,35 +277,45 @@ clippy_js_wasm_api: install_rs_check_toolchain
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
-clippy_tasks:
+clippy_tasks: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		-p tasks -- --no-deps -D warnings

 .PHONY: clippy_trivium # Run clippy lints on Trivium app
 clippy_trivium: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-trivium -- --no-deps -D warnings

 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
-clippy_all_targets:
+clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
-clippy_concrete_csprng:
+clippy_concrete_csprng: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=$(TARGET_ARCH_FEATURE) \
 		-p concrete-csprng -- --no-deps -D warnings

+.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
+clippy_zk_pok: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-zk-pok -- --no-deps -D warnings
+
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets clippy_c_api \
-clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_trivium
+clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
 clippy_concrete_csprng

+.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
+clippy_cuda_backend: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-cuda-backend -- --no-deps -D warnings
+
 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
@@ -277,6 +354,11 @@ build_tfhe_full: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets

+.PHONY: build_tfhe_coverage # Build with test coverage enabled
+build_tfhe_coverage: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
+
 .PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
 symlink_c_libs_without_fingerprint:
 	@./scripts/symlink_c_libs_without_fingerprint.sh \
@@ -286,22 +368,30 @@ symlink_c_libs_without_fingerprint:
 .PHONY: build_c_api # Build the C API for boolean, shortint and integer
 build_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,$(FORWARD_COMPAT_FEATURE) \
+		-p $(TFHE_SPEC)
+	@"$(MAKE)" symlink_c_libs_without_fingerprint
+
+.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
+build_c_api_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,gpu \
 		-p $(TFHE_SPEC)
 	@"$(MAKE)" symlink_c_libs_without_fingerprint

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
 build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4 \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
 		-p $(TFHE_SPEC)
+	@"$(MAKE)" symlink_c_libs_without_fingerprint

 .PHONY: build_web_js_api # Build the js API targeting the web browser
 build_web_js_api: install_rs_build_toolchain install_wasm_pack
 	cd tfhe && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
 		wasm-pack build --release --target=web \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental

 .PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
 build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
@@ -309,7 +399,7 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
 	rustup component add rust-src --toolchain $(RS_CHECK_TOOLCHAIN) && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory,+mutable-globals" rustup run $(RS_CHECK_TOOLCHAIN) \
 		wasm-pack build --release --target=web \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api \
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok-experimental \
 		-Z build-std=panic_abort,std

 .PHONY: build_node_js_api # Build the js API targeting nodejs
@@ -317,7 +407,7 @@ build_node_js_api: install_rs_build_toolchain install_wasm_pack
 	cd tfhe && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
 		wasm-pack build --release --target=nodejs \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental

 .PHONY: build_concrete_csprng # Build concrete_csprng
 build_concrete_csprng: install_rs_build_toolchain
@@ -327,10 +417,10 @@ build_concrete_csprng: install_rs_build_toolchain
 .PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
 test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC) -- core_crypto::
+		--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental -p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
+			--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
 	fi

 .PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
@@ -338,32 +428,40 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
 		--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage \
+		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
 		-p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 			--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
 			--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage,$(AVX512_FEATURE) \
-			-p $(TFHE_SPEC) -- core_crypto::; \
+			--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
+			-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
 	fi

+.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
+test_cuda_backend:
+	mkdir -p "$(TFHECUDA_BUILD)" && \
+		cd "$(TFHECUDA_BUILD)" && \
+		cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
+		make -j "$(CPU_COUNT)" && \
+		make test
+
 .PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_gpu: test_core_crypto_gpu test_integer_gpu
+test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend

 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
+test_core_crypto_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
-test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
+test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

 .PHONY: test_boolean # Run the tests of the boolean module
 test_boolean: install_rs_build_toolchain
@@ -375,8 +473,8 @@ test_boolean_cov: install_rs_check_toolchain install_tarpaulin
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,__coverage \
-		-p $(TFHE_SPEC) -- boolean::
+		--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache \
+		-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::

 .PHONY: test_c_api_rs # Run the rust tests for the C API
 test_c_api_rs: install_rs_check_toolchain
@@ -387,24 +485,28 @@ test_c_api_rs: install_rs_check_toolchain

 .PHONY: test_c_api_c # Run the C tests for the C API
 test_c_api_c: build_c_api
-	./scripts/c_api_tests.sh
+	./scripts/c_api_tests.sh --cargo-profile "$(CARGO_PROFILE)"

 .PHONY: test_c_api # Run all the tests for the C API
 test_c_api: test_c_api_rs test_c_api_c

+.PHONY: test_c_api_gpu # Run the C tests for the C API on the GPU backend
+test_c_api_gpu: build_c_api_gpu
+	./scripts/c_api_tests.sh --gpu --cargo-profile "$(CARGO_PROFILE)"
+
 .PHONY: test_shortint_ci # Run the tests for shortint ci
 test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)"
+		--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "$(TFHE_SPEC)"

 .PHONY: test_shortint_multi_bit_ci # Run the tests for shortint ci running only multibit tests
 test_shortint_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --multi-bit
+		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --tfhe-package "$(TFHE_SPEC)"

 .PHONY: test_shortint # Run all the tests for shortint
 test_shortint: install_rs_build_toolchain
@@ -416,15 +518,16 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,__coverage \
-		-p $(TFHE_SPEC) -- shortint::
+		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
+		-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::

 .PHONY: test_integer_ci # Run the tests for integer ci
 test_integer_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)"
+		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
+		--tfhe-package "$(TFHE_SPEC)"

 .PHONY: test_unsigned_integer_ci # Run the tests for unsigned integer ci
 test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
@@ -432,7 +535,7 @@ test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
-		--unsigned-only
+		--unsigned-only --tfhe-package "$(TFHE_SPEC)"

 .PHONY: test_signed_integer_ci # Run the tests for signed integer ci
 test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
@@ -440,14 +543,15 @@ test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
-		--signed-only
+		--signed-only --tfhe-package "$(TFHE_SPEC)"

 .PHONY: test_integer_multi_bit_ci # Run the tests for integer ci running only multibit tests
 test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)"
+		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
+		--tfhe-package "$(TFHE_SPEC)"

 .PHONY: test_unsigned_integer_multi_bit_ci # Run the tests for nsigned integer ci running only multibit tests
 test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
@@ -455,7 +559,7 @@ test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nex
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
-		--unsigned-only
+		--unsigned-only --tfhe-package "$(TFHE_SPEC)"

 .PHONY: test_signed_integer_multi_bit_ci # Run the tests for nsigned integer ci running only multibit tests
 test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
@@ -463,7 +567,7 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
-		--signed-only
+		--signed-only --tfhe-package "$(TFHE_SPEC)"

 .PHONY: test_safe_deserialization # Run the tests for safe deserialization
 test_safe_deserialization: install_rs_build_toolchain install_cargo_nextest
@@ -475,18 +579,45 @@ test_integer: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::

+.PHONY: test_integer_cov # Run the tests of the integer module with code coverage
+test_integer_cov: install_rs_check_toolchain install_tarpaulin
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
+		--out xml --output-dir coverage/integer --line --engine llvm --timeout 500 \
+		--implicit-test-threads \
+		--exclude-files $(COVERAGE_EXCLUDED_FILES) \
+		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
+		-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::
+
 .PHONY: test_high_level_api # Run all the tests for high_level_api
 test_high_level_api: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental -p $(TFHE_SPEC) \
 		-- high_level_api::

+test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
+		-E "test(/high_level_api::.*gpu.*/)"
+
 .PHONY: test_user_doc # Run tests from the .md documentation
 test_user_doc: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok-experimental \
+		-p $(TFHE_SPEC) \
 		-- test_user_docs::

+.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
+test_user_doc_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok-experimental -p $(TFHE_SPEC) \
+		-- test_user_docs::
+
+.PHONY: test_fhe_strings # Run tests for fhe_strings example
+test_fhe_strings: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--example fhe_strings \
+		--features=$(TARGET_ARCH_FEATURE),integer
+
 .PHONY: test_regex_engine # Run tests for regex_engine example
 test_regex_engine: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -513,48 +644,79 @@ test_kreyvium: install_rs_build_toolchain
 		-p tfhe-trivium -- --test-threads=1 kreyvium::

 .PHONY: test_concrete_csprng # Run concrete-csprng tests
-test_concrete_csprng:
+test_concrete_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng

+.PHONY: test_zk_pok # Run tfhe-zk-pok tests
+test_zk_pok: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		-p tfhe-zk-pok
+
 .PHONY: doc # Build rust doc
 doc: install_rs_check_toolchain
+	@# Even though we are not in docs.rs, this allows to "just" build the doc
+	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps -p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental --no-deps -p $(TFHE_SPEC)

 .PHONY: docs # Build rust doc alias for doc
 docs: doc

 .PHONY: lint_doc # Build rust doc with linting enabled
 lint_doc: install_rs_check_toolchain
+	@# Even though we are not in docs.rs, this allows to "just" build the doc
+	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p tfhe --no-deps
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental -p $(TFHE_SPEC) --no-deps

 .PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
 lint_docs: lint_doc

 .PHONY: format_doc_latex # Format the documentation latex equations to avoid broken rendering.
 format_doc_latex:
-	cargo xtask format_latex_doc
+	RUSTFLAGS="" cargo xtask format_latex_doc
 	@"$(MAKE)" --no-print-directory fmt
 	@printf "\n===============================\n\n"
 	@printf "Please manually inspect changes made by format_latex_doc, rustfmt can break equations \
 	if the line length is exceeded\n"
 	@printf "\n===============================\n"

+.PHONY: check_md_docs_are_tested # Checks that the rust codeblocks in our .md files are tested
+check_md_docs_are_tested:
+	RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested
+
+.PHONY: check_intra_md_links # Checks broken internal links in Markdown docs
+check_intra_md_links: install_mlc
+	mlc --offline --match-file-extension tfhe/docs
+
+.PHONY: check_md_links # Checks all broken links in Markdown docs
+check_md_links: install_mlc
+	mlc --match-file-extension tfhe/docs
+
 .PHONY: check_compile_tests # Build tests in debug without running them
-check_compile_tests:
+check_compile_tests: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
 		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
 		-p $(TFHE_SPEC)

 	@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
 		"$(MAKE)" build_c_api && \
-		./scripts/c_api_tests.sh --build-only; \
+		./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
 	fi

+.PHONY: check_compile_tests_benches_gpu # Build GPU tests and benchmarks in debug without running them
+check_compile_tests_benches_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
+		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,gpu \
+		-p $(TFHE_SPEC)
+	mkdir -p "$(TFHECUDA_BUILD)" && \
+		cd "$(TFHECUDA_BUILD)" && \
+		cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
+		make -j "$(CPU_COUNT)"
+
 .PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
 build_nodejs_test_docker:
 	DOCKER_BUILDKIT=1 docker build --build-arg RUST_TOOLCHAIN="$(RS_BUILD_TOOLCHAIN)" \
@@ -607,21 +769,21 @@ bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --
+	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
@@ -629,7 +791,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -637,7 +799,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -645,27 +807,25 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --
+	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_shortint # Run benchmarks for shortint
 bench_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_oprf # Run benchmarks for shortint
 bench_oprf: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench oprf-shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p tfhe
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench oprf-integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p tfhe
-
-
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
 bench_shortint_multi_bit: install_rs_check_toolchain
@@ -673,20 +833,37 @@ bench_shortint_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
-
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_boolean # Run benchmarks for boolean
 bench_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench boolean-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_pbs # Run benchmarks for PBS
 bench_pbs: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+
+.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
+bench_pbs_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench pbs-bench \
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+
+.PHONY: bench_ks # Run benchmarks for keyswitch
+bench_ks: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench ks-bench \
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+
+.PHONY: bench_ks_gpu # Run benchmarks for keyswitch on GPU backend
+bench_ks_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench ks-bench \
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
 bench_web_js_api_parallel: build_web_js_api_parallel
@@ -701,9 +878,10 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
 #
 # Utility tools
 #
+
 .PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
 gen_key_cache: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
+	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 		--example generates_test_keys \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -- \
 		$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
@@ -776,13 +954,15 @@ sha256_bool: install_rs_check_toolchain
 	--features=$(TARGET_ARCH_FEATURE),boolean

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
-pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests
+pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
+clippy_all check_compile_tests

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
-pcc_gpu: pcc clippy_gpu
+pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu

 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
-fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests
+fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_fast \
+check_compile_tests

 .PHONY: conformance # Automatically fix problems that can be fixed
 conformance: fix_newline fmt
--- a/README.md
+++ b/README.md
@@ -1,37 +1,71 @@
 <p align="center">
 <!-- product name logo -->
-  <img width=600 src="https://user-images.githubusercontent.com/5758427/231206749-8f146b97-3c5a-4201-8388-3ffa88580415.png">
-</p>
-<hr/>
-<p align="center">
-  <a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a>
-</p>
-<p align="center">
-<!-- Version badge using shields.io -->
-  <a href="https://github.com/zama-ai/tfhe-rs/releases">
-    <img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square">
-  </a>
-<!-- Zama Bounty Program -->
-  <a href="https://github.com/zama-ai/bounty-program">
-    <img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-yellow?style=flat-square">
-  </a>
+<picture>
+  <source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/5283e0ba-da1e-43af-9f2a-c5221367a12b">
+  <source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/b94a8c96-7595-400b-9311-70765c706955">
+  <img width=600 alt="Zama TFHE-rs">
+</picture>
 </p>
+
 <hr/>

+<p align="center">
+  <a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
+</p>

-**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer
-arithmetics over encrypted data. It includes:
- - a **Rust** API
- - a **C** API
- - and a **client-side WASM** API

-**TFHE-rs** is meant for developers and researchers who want full control over
-what they can do with TFHE, while not having to worry about the low level
+<p align="center">
+  <a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
+  <a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
+  <a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
+</p>
+
+## About
+
+### What is TFHE-rs
+
+**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer arithmetic over encrypted data.
+
+It includes:
+- a **Rust** API
+- a **C** API
+- and a **client-side WASM** API
+
+TFHE-rs is designed for developers and researchers who want full control over
+what they can do with TFHE, while not having to worry about the low-level
 implementation. The goal is to have a stable, simple, high-performance, and
 production-ready library for all the advanced features of TFHE.
+<br></br>
+
+### Main features
+
+- **Low-level cryptographic library** that implements Zama’s variant of TFHE, including programmable bootstrapping
+- **Implementation of the original TFHE boolean API** that can be used as a drop-in replacement for other TFHE libraries
+- **Short integer API** that enables exact, unbounded FHE integer arithmetic with up to 8 bits of message space
+- **Size-efficient public key encryption**
+- **Ciphertext and server key compression** for efficient data transfer
+- **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**
+
+*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
+<br></br>
+
+## Table of Contents
+- **[Getting Started](#getting-started)**
+   - [Cargo.toml configuration](#cargotoml-configuration)
+   - [A simple example](#a-simple-example)
+- **[Resources](#resources)**
+   - [TFHE deep dive](#tfhe-deep-dive)
+   - [Tutorials](#tutorials)
+   - [Documentation](#documentation)
+- **[Working with TFHE-rs](#working-with-tfhe-rs)**
+   - [Disclaimers](#disclaimers)
+   - [Citations](#citations)
+   - [Contributing](#contributing)
+   - [License](#license)
+- **[Support](#support)**
+<br></br>

 ## Getting Started
-The steps to run a first example are described below. 

 ### Cargo.toml configuration
 To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:
@@ -47,20 +81,24 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64-un
 ```toml
 tfhe = { version = "*", features = ["boolean", "shortint", "integer", "aarch64-unix"] }
 ```
-Note: users with ARM devices must compile `TFHE-rs` using a stable toolchain with version >= 1.72.

-
-+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) 
-running Windows:
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:

 ```toml
 tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"] }
 ```

-Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
+> [!Note]
+> You need to use a Rust version >= 1.73 to compile TFHE-rs.

+> [!Note]
+> aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.

-## A simple example
+<p align="right">
+  <a href="#about" > ↑ Back to top </a> 
+</p>
+
+### A simple example

 Here is a full example:

@@ -93,13 +131,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Clear equivalent computations: 1344 * 5 = 6720
    let encrypted_res_mul = &encrypted_a * &encrypted_b;

-    // Clear equivalent computations: 1344 >> 5 = 42
+    // Clear equivalent computations: 6720 >> 5 = 210
    encrypted_a = &encrypted_res_mul >> &encrypted_b;

    // Clear equivalent computations: let casted_a = a as u8;
    let casted_a: FheUint8 = encrypted_a.cast_into();

-    // Clear equivalent computations: min(42, 7) = 7
+    // Clear equivalent computations: min(210, 7) = 7
    let encrypted_res_min = &casted_a.min(&encrypted_c);

    // Operation between clear and encrypted data:
@@ -117,32 +155,70 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 To run this code, use the following command: 
 <p align="center"> <code> cargo run --release </code> </p>

-Note that when running code that uses `tfhe-rs`, it is highly recommended
+> [!Note]
+> Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performances possible.

+*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*

-## Contributing
+<p align="right">
+  <a href="#about" > ↑ Back to top </a> 
+</p>

-There are two ways to contribute to TFHE-rs:

- you can open issues to report bugs or typos, or to suggest new ideas
- you can ask to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
-(becoming an approved contributor involves signing our Contributor License Agreement (CLA))

-Only approved contributors can send pull requests, so please make sure to get in touch before you do!
+## Resources 

-## Credits
+### TFHE deep dive
+- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
+- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
+- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
+- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
+<br></br>

-This library uses several dependencies and we would like to thank the contributors of those
-libraries.
+### Tutorials
+- [[Video tutorial] Implement signed integers using TFHE-rs](https://www.zama.ai/post/video-tutorial-implement-signed-integers-ssing-tfhe-rs)
+- [Homomorphic parity bit](https://docs.zama.ai/tfhe-rs/tutorials/parity_bit)
+- [Homomorphic case changing on Ascii string](https://docs.zama.ai/tfhe-rs/tutorials/ascii_fhe_string)
+- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
+- [Dark market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
+- [Regular expression engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)

-## Need support?
-<a target="_blank" href="https://community.zama.ai">
-  <img src="https://user-images.githubusercontent.com/5758427/231115030-21195b55-2629-4c01-9809-be5059243999.png">
-</a>
+*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
+<br></br>
+### Documentation

-## Citing TFHE-rs
+Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
+<p align="right">
+  <a href="#about" > ↑ Back to top </a> 
+</p>

+
+## Working with TFHE-rs
+
+### Disclaimers
+
+#### Security Estimation
+
+Security estimations are done using the
+[Lattice Estimator](https://github.com/malb/lattice-estimator)
+with `red_cost_model = reduction.RC.BDGL16`.
+
+When a new update is published in the Lattice Estimator, we update parameters accordingly.
+
+#### Security Model
+
+The default parameters for the TFHE-rs library are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at p_error = $2^{-40}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [1]. 
+
+[1] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf
+
+#### Side-Channel Attacks
+
+Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
+and will be released in upcoming versions.
+<br></br>
+
+### Citations
 To cite TFHE-rs in academic papers, please use the following entry:

 ```text
@@ -154,22 +230,35 @@ To cite TFHE-rs in academic papers, please use the following entry:
 }
 ```

-## License
+### Contributing

-This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
-please contact us at `hello@zama.ai`.
+There are two ways to contribute to TFHE-rs:

-## Disclaimers
+- [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
+- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).

-### Security Estimation
+Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
+<br></br>

-Security estimations are done using the
-[Lattice Estimator](https://github.com/malb/lattice-estimator)
-with `red_cost_model = reduction.RC.BDGL16`.
+### License
+This software is distributed under the **BSD-3-Clause-Clear** license. If you have any questions, please contact us at hello@zama.ai.
+<p align="right">
+  <a href="#about" > ↑ Back to top </a> 
+</p>

-When a new update is published in the Lattice Estimator, we update parameters accordingly.

-### Side-Channel Attacks
+## Support

-Mitigation for side channel attacks have not yet been implemented in TFHE-rs,
-and will be released in upcoming versions.
+<a target="_blank" href="https://community.zama.ai">
+<picture>
+  <source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/08656d0a-3f44-4126-b8b6-8c601dff5380">
+  <source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/1c9c9308-50ac-4aab-a4b9-469bb8c536a4">
+  <img alt="Support">
+</picture>
+</a>
+
+🌟 If you find this project helpful or interesting, please consider giving it a star on GitHub! Your support helps to grow the community and motivates further development. 
+
+<p align="right">
+  <a href="#about" > ↑ Back to top </a> 
+</p>
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -15,7 +15,6 @@ Example of a Rust main below:
 ```rust
 use tfhe::{ConfigBuilder, generate_keys, FheBool};
 use tfhe::prelude::*;
-
 use tfhe_trivium::TriviumStream;

 fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
@@ -72,7 +71,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
 }

 fn main() {
-	let config = ConfigBuilder::all_disabled().enable_default_bool().build();
+	let config = ConfigBuilder::default().build();
 	let (client_key, server_key) = generate_keys(config);

 	let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -139,14 +138,12 @@ Example code:
 ```rust
 use tfhe::shortint::prelude::*;
 use tfhe::shortint::CastingKey;
-
 use tfhe::{ConfigBuilder, generate_keys, FheUint64};
 use tfhe::prelude::*;
-
 use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
-	let config = ConfigBuilder::all_disabled().enable_default_integers().build();
+	let config = ConfigBuilder::default().build();
 	let (hl_client_key, hl_server_key) = generate_keys(config);
 	let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
 	let ksk = CastingKey::new((&client_key, &server_key), (&hl_client_key, &hl_server_key));
--- a/apps/trivium/benches/kreyvium_bool.rs
+++ b/apps/trivium/benches/kreyvium_bool.rs
@@ -1,10 +1,8 @@
+use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheBool};
-
 use tfhe_trivium::KreyviumStream;

-use criterion::Criterion;
-
 pub fn kreyvium_bool_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
--- a/apps/trivium/benches/kreyvium_byte.rs
+++ b/apps/trivium/benches/kreyvium_byte.rs
@@ -1,10 +1,8 @@
+use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
-
 use tfhe_trivium::{KreyviumStreamByte, TransCiphering};

-use criterion::Criterion;
-
 pub fn kreyvium_byte_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
        .enable_function_evaluation()
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,12 +1,9 @@
+use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::shortint::prelude::*;
-use tfhe::shortint::KeySwitchingKey;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
-
 use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

-use criterion::Criterion;
-
 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
--- a/apps/trivium/benches/trivium_bool.rs
+++ b/apps/trivium/benches/trivium_bool.rs
@@ -1,10 +1,8 @@
+use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheBool};
-
 use tfhe_trivium::TriviumStream;

-use criterion::Criterion;
-
 pub fn trivium_bool_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
--- a/apps/trivium/benches/trivium_byte.rs
+++ b/apps/trivium/benches/trivium_byte.rs
@@ -1,10 +1,8 @@
+use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
-
 use tfhe_trivium::{TransCiphering, TriviumStreamByte};

-use criterion::Criterion;
-
 pub fn trivium_byte_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,12 +1,9 @@
+use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::shortint::prelude::*;
-use tfhe::shortint::KeySwitchingKey;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
-
 use tfhe_trivium::{TransCiphering, TriviumStreamShortint};

-use criterion::Criterion;
-
 pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
--- a/apps/trivium/src/kreyvium/kreyvium.rs
+++ b/apps/trivium/src/kreyvium/kreyvium.rs
@@ -2,12 +2,10 @@
 //! for the representation of the inner bits.

 use crate::static_deque::StaticDeque;
-
+use rayon::prelude::*;
 use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheBool, ServerKey};

-use rayon::prelude::*;
-
 /// Internal trait specifying which operations are necessary for KreyviumStream generic type
 pub trait KreyviumBoolInput<OpOutput>:
    Sized
--- a/apps/trivium/src/kreyvium/kreyvium_byte.rs
+++ b/apps/trivium/src/kreyvium/kreyvium_byte.rs
@@ -2,12 +2,10 @@
 //! for the representation of the inner bits.

 use crate::static_deque::{StaticByteDeque, StaticByteDequeInput};
-
+use rayon::prelude::*;
 use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheUint8, ServerKey};

-use rayon::prelude::*;
-
 /// Internal trait specifying which operations are necessary for KreyviumStreamByte generic type
 pub trait KreyviumByteInput<OpOutput>:
    Sized
--- a/apps/trivium/src/kreyvium/kreyvium_shortint.rs
+++ b/apps/trivium/src/kreyvium/kreyvium_shortint.rs
@@ -1,8 +1,6 @@
 use crate::static_deque::StaticDeque;
-
-use tfhe::shortint::prelude::*;
-
 use rayon::prelude::*;
+use tfhe::shortint::prelude::*;

 /// KreyviumStreamShortint: a struct implementing the Kreyvium stream cipher, using a generic
 /// Ciphertext for the internal representation of bits (intended to represent a single bit). To be
@@ -36,7 +34,7 @@ impl KreyviumStreamShortint {
        let mut c_register: [Ciphertext; 111] = [0; 111].map(|x| sk.create_trivial(x));

        for i in 0..93 {
-            a_register[i] = key[128 - 93 + i].clone();
+            a_register[i].clone_from(&key[128 - 93 + i]);
        }
        for i in 0..84 {
            b_register[i] = sk.create_trivial(iv[128 - 84 + i]);
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,8 +1,7 @@
+use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};

-use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
-
 // Values for these tests come from the github repo renaud1239/Kreyvium,
 // commit fd6828f68711276c25f55e605935028f5e843f43

--- a/apps/trivium/src/static_deque/mod.rs
+++ b/apps/trivium/src/static_deque/mod.rs
@@ -1,5 +1,6 @@
 #[allow(clippy::module_inception)]
 mod static_deque;
 pub use static_deque::StaticDeque;
+
 mod static_byte_deque;
 pub use static_byte_deque::{StaticByteDeque, StaticByteDequeInput};
--- a/apps/trivium/src/static_deque/static_byte_deque.rs
+++ b/apps/trivium/src/static_deque/static_byte_deque.rs
@@ -4,7 +4,6 @@
 //! This is pretending to store bits, and allows accessing bits in chunks of 8 consecutive.

 use crate::static_deque::StaticDeque;
-
 use tfhe::FheUint8;

 /// Internal trait specifying which operations are needed by StaticByteDeque
--- a/apps/trivium/src/trans_ciphering/mod.rs
+++ b/apps/trivium/src/trans_ciphering/mod.rs
@@ -2,12 +2,10 @@
 //! when trans ciphering is available to them.

 use crate::{KreyviumStreamByte, KreyviumStreamShortint, TriviumStreamByte, TriviumStreamShortint};
-use tfhe::shortint::Ciphertext;
-
-use tfhe::prelude::*;
-use tfhe::{set_server_key, unset_server_key, FheUint64, FheUint8, ServerKey};
-
 use rayon::prelude::*;
+use tfhe::prelude::*;
+use tfhe::shortint::Ciphertext;
+use tfhe::{set_server_key, unset_server_key, FheUint64, FheUint8, ServerKey};

 /// Triat specifying the interface for trans ciphering a FheUint64 object. Since it is meant
 /// to be used with stream ciphers, encryption and decryption are by default the same.
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,8 +1,7 @@
+use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};

-use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
-
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
 // file testvectors/trivium-80.80.test-vectors

--- a/apps/trivium/src/trivium/trivium_bool.rs
+++ b/apps/trivium/src/trivium/trivium_bool.rs
@@ -2,12 +2,10 @@
 //! for the representation of the inner bits.

 use crate::static_deque::StaticDeque;
-
+use rayon::prelude::*;
 use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheBool, ServerKey};

-use rayon::prelude::*;
-
 /// Internal trait specifying which operations are necessary for TriviumStream generic type
 pub trait TriviumBoolInput<OpOutput>:
    Sized
--- a/apps/trivium/src/trivium/trivium_byte.rs
+++ b/apps/trivium/src/trivium/trivium_byte.rs
@@ -2,12 +2,10 @@
 //! for the representation of the inner bits.

 use crate::static_deque::{StaticByteDeque, StaticByteDequeInput};
-
+use rayon::prelude::*;
 use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheUint8, ServerKey};

-use rayon::prelude::*;
-
 /// Internal trait specifying which operations are necessary for TriviumStreamByte generic type
 pub trait TriviumByteInput<OpOutput>:
    Sized
--- a/apps/trivium/src/trivium/trivium_shortint.rs
+++ b/apps/trivium/src/trivium/trivium_shortint.rs
@@ -1,8 +1,6 @@
 use crate::static_deque::StaticDeque;
-
-use tfhe::shortint::prelude::*;
-
 use rayon::prelude::*;
+use tfhe::shortint::prelude::*;

 /// TriviumStreamShortint: a struct implementing the Trivium stream cipher, using a generic
 /// Ciphertext for the internal representation of bits (intended to represent a single bit). To be
@@ -34,7 +32,7 @@ impl TriviumStreamShortint {
        let mut c_register: [Ciphertext; 111] = [0; 111].map(|x| sk.create_trivial(x));

        for i in 0..80 {
-            a_register[93 - 80 + i] = key[i].clone();
+            a_register[93 - 80 + i].clone_from(&key[i]);
            b_register[84 - 80 + i] = sk.create_trivial(iv[i]);
        }

--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.1.2"
+version = "0.2.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -13,6 +13,7 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]

 [build-dependencies]
 cmake = { version = "0.1" }
+pkg-config = { version = "0.3" }

 [dependencies]
 thiserror = "1.0"
--- a/backends/tfhe-cuda-backend/README.md
+++ b/backends/tfhe-cuda-backend/README.md
@@ -30,17 +30,17 @@ The cryptographic operations it provides are:

 ## Build

-The Cuda project held in `tfhe-cuda-backend` can be compiled independently from Concrete in the 
-following way:
+The Cuda project held in `tfhe-cuda-backend` can be compiled independently from TFHE-rs in the following way:
 ```
 git clone git@github.com:zama-ai/tfhe-rs
-cd backends/tfhe-cuda-backend/implementation
+cd backends/tfhe-cuda-backend/cuda
 mkdir build
 cd build
 cmake ..
 make
 ```
 The compute capability is detected automatically (with the first GPU information) and set accordingly.
+If your machine does not have an available Nvidia GPU, the compilation will work if you have the nvcc compiler installed. The generated executable will target a 7.0 compute capability (sm_70).

 ## Links

--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -2,6 +2,12 @@ use std::env;
 use std::process::Command;

 fn main() {
+    if let Ok(val) = env::var("DOCS_RS") {
+        if val.parse::<u32>() == Ok(1) {
+            return;
+        }
+    }
+
    println!("Build tfhe-cuda-backend");
    if env::consts::OS == "linux" {
        let output = Command::new("./get_os_name.sh").output().unwrap();
@@ -15,7 +21,15 @@ fn main() {
        let dest = cmake::build("cuda");
        println!("cargo:rustc-link-search=native={}", dest.display());
        println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
-        println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
+
+        // Try to find the cuda libs with pkg-config, default to the path used by the nvidia runfile
+        if pkg_config::Config::new()
+            .atleast_version("10")
+            .probe("cuda")
+            .is_err()
+        {
+            println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
+        }
        println!("cargo:rustc-link-lib=gomp");
        println!("cargo:rustc-link-lib=cudart");
        println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
--- a/backends/tfhe-cuda-backend/cuda/.gitignore
+++ b/backends/tfhe-cuda-backend/cuda/.gitignore
@@ -0,0 +1,2 @@
+/build/
+include/cuda_config.h
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
-project(tfhe_cuda_backend LANGUAGES CXX CUDA)
+project(tfhe_cuda_backend LANGUAGES CXX)

 # See if the minimum CUDA version is available. If not, only enable documentation building.
 set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
@@ -56,11 +56,17 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -g")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
-set(CMAKE_CUDA_ARCHITECTURES native)
-if(NOT CUDA_NVCC_FLAGS)
-  set(CUDA_NVCC_FLAGS -arch=sm_70)
+if(${CUDA_SUCCESS})
+  set(CMAKE_CUDA_ARCHITECTURES native)
+  string(REPLACE "-arch=sm_" "" CUDA_ARCH "${ARCH}")
+  set(CUDA_ARCH "${CUDA_ARCH}0")
+else()
+  set(CMAKE_CUDA_ARCHITECTURES 70)
+  set(CUDA_ARCH "700")
 endif()

+add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
+
 # in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
 set(CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
@@ -70,10 +76,13 @@ set(CMAKE_CUDA_FLAGS
 set(INCLUDE_DIR include)

 add_subdirectory(src)
+enable_testing()
+add_subdirectory(tests_and_benchmarks)
 target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})

 # This is required for rust cargo build
 install(TARGETS tfhe_cuda_backend DESTINATION .)
+
 install(TARGETS tfhe_cuda_backend DESTINATION lib)

 # Define a function to add a lint target.
@@ -85,5 +94,3 @@ if(CPPLINT)
  set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
  # set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
 endif()
-
-enable_testing()
--- a/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
+++ b/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
@@ -1,6 +1,19 @@
-#!/bin/bash
+#!/usr/bin/env bash

-find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
+set -e
+
+while getopts ":c" option; do
+  case $option in
+    c)
+      # -c: check mode; exits with a non-zero status if any source or CMake file is not properly formatted
+      find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
+      cmake-format -i CMakeLists.txt -c .cmake-format-config.py
+      find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
+      git diff --exit-code
+      exit
+      ;;
+  esac
+done
+find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
 cmake-format -i CMakeLists.txt -c .cmake-format-config.py
-
-find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
+find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
--- a/backends/tfhe-cuda-backend/cuda/include/bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/bootstrap.h
@@ -1,118 +0,0 @@
-#ifndef CUDA_BOOTSTRAP_H
-#define CUDA_BOOTSTRAP_H
-
-#include "device.h"
-#include <cstdint>
-
-enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
-
-extern "C" {
-void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
-                                 cuda_stream_t *stream,
-                                 uint32_t polynomial_size,
-                                 uint32_t total_polynomials);
-
-void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
-                                       cuda_stream_t *stream,
-                                       uint32_t input_lwe_dim,
-                                       uint32_t glwe_dim, uint32_t level_count,
-                                       uint32_t polynomial_size);
-
-void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
-                                       cuda_stream_t *stream,
-                                       uint32_t input_lwe_dim,
-                                       uint32_t glwe_dim, uint32_t level_count,
-                                       uint32_t polynomial_size);
-
-void scratch_cuda_bootstrap_amortized_32(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory);
-
-void scratch_cuda_bootstrap_amortized_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory);
-
-void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
-                                      int8_t **pbs_buffer);
-
-void scratch_cuda_bootstrap_low_latency_32(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
-
-void scratch_cuda_bootstrap_low_latency_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
-
-void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
-                                        int8_t **pbs_buffer);
-
-uint64_t get_buffer_size_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
-
-uint64_t get_buffer_size_bootstrap_low_latency_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
-}
-
-#ifdef __CUDACC__
-__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
-                                         int glwe_dimension,
-                                         uint32_t level_count);
-
-template <typename T>
-__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
-                                     uint32_t polynomial_size,
-                                     int glwe_dimension, uint32_t level_count);
-
-template <typename T>
-__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
-                                     uint32_t polynomial_size,
-                                     int glwe_dimension, uint32_t level_count);
-
-template <typename T>
-__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
-    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
-    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
-
-#endif
-
-#endif // CUDA_BOOTSTRAP_H
--- a/backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h
@@ -1,46 +0,0 @@
-#ifndef CUDA_MULTI_BIT_H
-#define CUDA_MULTI_BIT_H
-
-#include <cstdint>
-
-extern "C" {
-void cuda_convert_lwe_multi_bit_bootstrap_key_64(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
-    uint32_t grouping_factor);
-
-void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t chunk_size = 0);
-
-void scratch_cuda_multi_bit_pbs_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t chunk_size = 0);
-
-void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
-}
-#ifdef __CUDACC__
-__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
-                                     uint32_t level_count,
-                                     uint32_t glwe_dimension,
-                                     uint32_t num_samples);
-
-__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
-                                             uint32_t level_count,
-                                             uint32_t glwe_dimension,
-                                             uint32_t ct_count);
-
-__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
-#endif
-
-#endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -4,14 +4,14 @@
 #include <cstdint>

 extern "C" {
-void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
-                                                  void *v_stream,
+void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
+                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);
-void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
-                                                  void *v_stream,
+void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
                                                  uint32_t gpu_index,
+                                                  void *dest, void *src,
                                                  uint32_t number_of_cts,
                                                  uint32_t lwe_dimension);
 };
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -8,81 +8,70 @@
 #include <cuda_runtime.h>

 #define synchronize_threads_in_block() __syncthreads()
-
 extern "C" {

-struct cuda_stream_t {
-  cudaStream_t stream;
-  uint32_t gpu_index;
-
-  cuda_stream_t(uint32_t gpu_index) {
-    this->gpu_index = gpu_index;
-
-    cudaStreamCreate(&stream);
-  }
-
-  void release() {
-    cudaSetDevice(gpu_index);
-    cudaStreamDestroy(stream);
-  }
-
-  void synchronize() { cudaStreamSynchronize(stream); }
-};
-
-cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
-
-int cuda_destroy_stream(cuda_stream_t *stream);
-
-void *cuda_malloc(uint64_t size, uint32_t gpu_index);
-
-void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
-
-int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
-
-int cuda_check_support_cooperative_groups();
-
-int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);
-
-int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                             cuda_stream_t *stream);
-
-int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                                 cuda_stream_t *stream);
-
-int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);
-
-int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                             cuda_stream_t *stream);
-
-int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
-                      cuda_stream_t *stream);
-
-int cuda_get_number_of_gpus();
-
-int cuda_synchronize_device(uint32_t gpu_index);
-
-int cuda_drop(void *ptr, uint32_t gpu_index);
-
-int cuda_drop_async(void *ptr, cuda_stream_t *stream);
-
-int cuda_get_max_shared_memory(uint32_t gpu_index);
-
-int cuda_synchronize_stream(cuda_stream_t *stream);
-
 #define check_cuda_error(ans)                                                  \
  { cuda_error((ans), __FILE__, __LINE__); }
-inline void cuda_error(cudaError_t code, const char *file, int line,
-                       bool abort = true) {
+inline void cuda_error(cudaError_t code, const char *file, int line) {
  if (code != cudaSuccess) {
-    fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
-            line);
-    if (abort)
-      exit(code);
+    std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code),
+                 file, line);
+    std::abort();
  }
 }
+#define PANIC(format, ...)                                                     \
+  {                                                                            \
+    std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__,         \
+                 __LINE__, __func__, ##__VA_ARGS__);                           \
+    std::abort();                                                              \
+  }
+
+cudaStream_t cuda_create_stream(uint32_t gpu_index);
+
+void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);
+
+void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);
+
+void *cuda_malloc(uint64_t size, uint32_t gpu_index);
+
+void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
+
+void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
+
+bool cuda_check_support_cooperative_groups();
+
+bool cuda_check_support_thread_block_clusters();
+
+void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
+                              cudaStream_t stream, uint32_t gpu_index);
+
+void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index);
+
+void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                              cudaStream_t stream, uint32_t gpu_index);
+
+void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
+                       cudaStream_t stream, uint32_t gpu_index);
+
+int cuda_get_number_of_gpus();
+
+void cuda_synchronize_device(uint32_t gpu_index);
+
+void cuda_drop(void *ptr, uint32_t gpu_index);
+
+void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+
+int cuda_get_max_shared_memory(uint32_t gpu_index);
+
+void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
+                              cudaStreamCallback_t callback, void *user_data);
 }

+void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
+                                  void *host_pointer);
+
 template <typename Torus>
-void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
-                          Torus n);
+void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
+                          Torus *d_array, Torus value, Torus n);
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/helper.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper.h
@@ -0,0 +1,10 @@
+#ifndef HELPER_H
+#define HELPER_H
+
+extern "C" {
+int cuda_setup_multi_gpu();
+}
+
+void multi_gpu_checks(uint32_t gpu_count);
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -6,16 +6,16 @@
 extern "C" {

 void cuda_keyswitch_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);

 void cuda_keyswitch_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -1,48 +1,48 @@
 #ifndef CUDA_LINALG_H_
 #define CUDA_LINALG_H_

-#include "bootstrap.h"
+#include "programmable_bootstrap.h"
 #include <cstdint>
 #include <device.h>

 extern "C" {

-void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
+void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
-void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
+void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
+void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
+void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
                                       void *lwe_array_out,
                                       void *lwe_array_in_1,
                                       void *lwe_array_in_2,
                                       uint32_t input_lwe_dimension,
                                       uint32_t input_lwe_ciphertext_count);
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *plaintext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
 void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -0,0 +1,431 @@
+#ifndef CUDA_BOOTSTRAP_H
+#define CUDA_BOOTSTRAP_H
+
+#include "device.h"
+#include <cstdint>
+
+enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
+enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
+
+extern "C" {
+void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
+                                 void *input1, void *input2, void *output,
+                                 uint32_t polynomial_size,
+                                 uint32_t total_polynomials);
+
+void cuda_convert_lwe_programmable_bootstrap_key_32(
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size);
+
+void cuda_convert_lwe_programmable_bootstrap_key_64(
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size);
+
+void scratch_cuda_programmable_bootstrap_amortized_32(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+
+void scratch_cuda_programmable_bootstrap_amortized_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+
+void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
+    uint32_t max_shared_memory);
+
+void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
+    uint32_t max_shared_memory);
+
+void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
+                                                   uint32_t gpu_index,
+                                                   int8_t **pbs_buffer);
+
+void scratch_cuda_programmable_bootstrap_32(
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+
+void scratch_cuda_programmable_bootstrap_64(
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
+    uint32_t max_shared_memory);
+
+void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
+    uint32_t max_shared_memory);
+
+void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
+                                         int8_t **pbs_buffer);
+
+uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
+
+uint64_t get_buffer_size_programmable_bootstrap_64(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
+}
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_programmable_bootstrap_step_one(
+    uint32_t polynomial_size) {
+  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
+         sizeof(double2) * polynomial_size / 2; // accumulator fft
+}
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_programmable_bootstrap_step_two(
+    uint32_t polynomial_size) {
+  return sizeof(Torus) * polynomial_size +      // accumulator
+         sizeof(double2) * polynomial_size / 2; // accumulator fft
+}
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
+  return sizeof(double2) * polynomial_size / 2; // accumulator fft
+}
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
+  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
+         sizeof(Torus) * polynomial_size +      // accumulator
+         sizeof(double2) * polynomial_size / 2; // accumulator fft
+}
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_programmable_bootstrap_tbc(
+    uint32_t polynomial_size) {
+  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
+}
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
+    uint32_t polynomial_size) {
+  return sizeof(double2) * polynomial_size / 2; // tbc
+}
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
+  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
+         sizeof(Torus) * polynomial_size +      // accumulator
+         sizeof(double2) * polynomial_size / 2; // accumulator fft
+}
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
+  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
+}
+
+template <typename Torus>
+__host__ bool
+supports_distributed_shared_memory_on_classic_programmable_bootstrap(
+    uint32_t polynomial_size, uint32_t max_shared_memory);
+
+template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
+
+template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
+  int8_t *d_mem = nullptr; // scratch used when shared memory is too small
+
+  Torus *global_accumulator = nullptr; // only allocated by DEFAULT
+  double2 *global_accumulator_fft = nullptr;
+
+  PBS_VARIANT pbs_variant;
+  // Allocates the buffers required by pbs_variant on stream / gpu_index.
+  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
+             uint32_t polynomial_size, uint32_t level_count,
+             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
+             bool allocate_gpu_memory) {
+    // With allocate_gpu_memory == false every pointer member stays nullptr.
+    this->pbs_variant = pbs_variant;
+
+    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+
+    if (allocate_gpu_memory) {
+      switch (pbs_variant) {
+      case PBS_VARIANT::DEFAULT: {
+        uint64_t full_sm_step_one =
+            get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
+                polynomial_size);
+        uint64_t full_sm_step_two =
+            get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
+                polynomial_size);
+        uint64_t partial_sm =
+            get_buffer_size_partial_sm_programmable_bootstrap<Torus>(
+                polynomial_size);
+
+        uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
+        uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
+        uint64_t full_dm = full_sm_step_one;
+
+        uint64_t device_mem = 0;
+        if (max_shared_memory < partial_sm) {
+          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                       (glwe_dimension + 1);
+        } else if (max_shared_memory < full_sm_step_two) {
+          device_mem =
+              (partial_dm_step_two + partial_dm_step_one * level_count) *
+              input_lwe_ciphertext_count * (glwe_dimension + 1);
+        } else if (max_shared_memory < full_sm_step_one) {
+          device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
+                       level_count * (glwe_dimension + 1);
+        }
+        // Otherwise, both kernels run all in shared memory
+        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
+
+        global_accumulator_fft = (double2 *)cuda_malloc_async(
+            sizeof(double2) * (glwe_dimension + 1) * level_count *
+                input_lwe_ciphertext_count * (polynomial_size / 2),
+            stream, gpu_index);
+
+        global_accumulator = (Torus *)cuda_malloc_async(
+            sizeof(Torus) * (glwe_dimension + 1) *
+                input_lwe_ciphertext_count * polynomial_size,
+            stream, gpu_index);
+      } break;
+      case PBS_VARIANT::CG: {
+        uint64_t full_sm =
+            get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(
+                polynomial_size);
+        uint64_t partial_sm =
+            get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
+                polynomial_size);
+
+        uint64_t partial_dm = full_sm - partial_sm;
+        uint64_t full_dm = full_sm;
+        uint64_t device_mem = 0;
+
+        if (max_shared_memory < partial_sm) {
+          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                       (glwe_dimension + 1);
+        } else if (max_shared_memory < full_sm) {
+          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                       (glwe_dimension + 1);
+        }
+
+        // Otherwise, both kernels run all in shared memory
+        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
+
+        global_accumulator_fft = (double2 *)cuda_malloc_async(
+            sizeof(double2) * (glwe_dimension + 1) * level_count *
+                input_lwe_ciphertext_count * (polynomial_size / 2),
+            stream, gpu_index);
+      } break;
+#if CUDA_ARCH >= 900
+      case PBS_VARIANT::TBC: {
+
+        bool supports_dsm =
+            supports_distributed_shared_memory_on_classic_programmable_bootstrap<
+                Torus>(polynomial_size, max_shared_memory);
+
+        uint64_t full_sm =
+            get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
+                polynomial_size);
+        uint64_t partial_sm =
+            get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
+                polynomial_size);
+        uint64_t minimum_sm_tbc = 0;
+        if (supports_dsm)
+          minimum_sm_tbc =
+              get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
+                  Torus>(polynomial_size);
+
+        uint64_t partial_dm = full_sm - partial_sm;
+        uint64_t full_dm = full_sm;
+        uint64_t device_mem = 0;
+
+        // Running the TBC PBS needs a minimum amount of shared memory,
+        // minimum_sm_tbc. Those bytes are known to be available: otherwise an
+        // earlier check would have redirected the computation to another
+        // variant. If fewer than partial_sm bytes remain on top of that, the
+        // TBC PBS runs in NOSM mode. If at least partial_sm but fewer than
+        // full_sm bytes remain, it runs in PARTIALSM mode. Otherwise, FULLSM.
+        //
+        // NOSM mode still requires minimum_sm_tbc shared memory bytes.
+        if (max_shared_memory < partial_sm + minimum_sm_tbc) {
+          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                       (glwe_dimension + 1);
+        } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
+          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                       (glwe_dimension + 1);
+        }
+
+        // Otherwise, both kernels run all in shared memory
+        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
+
+        global_accumulator_fft = (double2 *)cuda_malloc_async(
+            sizeof(double2) * (glwe_dimension + 1) * level_count *
+                input_lwe_ciphertext_count * (polynomial_size / 2),
+            stream, gpu_index);
+      } break;
+#endif
+      default:
+        PANIC("Cuda error (PBS): unsupported implementation variant.")
+      }
+    }
+  }
+  // Releases the buffers through cuda_drop_async on the given stream.
+  void release(cudaStream_t stream, uint32_t gpu_index) {
+    cuda_drop_async(d_mem, stream, gpu_index);
+    cuda_drop_async(global_accumulator_fft, stream, gpu_index);
+
+    if (pbs_variant == DEFAULT)
+      cuda_drop_async(global_accumulator, stream, gpu_index);
+  }
+};
+
+template <typename Torus>
+__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+  // Returns the device buffer size in bytes, padded to sizeof(double2).
+  uint64_t full_sm =
+      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
+  uint64_t partial_sm =
+      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
+          polynomial_size);
+  // Widen first so that none of the products below is computed in 32 bits.
+  uint64_t num_polynomials = (uint64_t)(glwe_dimension + 1) * level_count *
+                             input_lwe_ciphertext_count;
+  uint64_t device_mem = 0;
+  if (max_shared_memory < partial_sm) {
+    device_mem = full_sm * num_polynomials;
+  } else if (max_shared_memory < full_sm) {
+    device_mem = (full_sm - partial_sm) * num_polynomials;
+  }
+  uint64_t buffer_size =
+      device_mem + num_polynomials * polynomial_size / 2 * sizeof(double2);
+  // Round up to a multiple of sizeof(double2); x + x % a does not do that.
+  const uint64_t alignment = sizeof(double2);
+  return (buffer_size + alignment - 1) / alignment * alignment;
+}
+
+template <typename Torus>
+bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
+                                                   uint32_t polynomial_size,
+                                                   uint32_t level_count,
+                                                   uint32_t num_samples,
+                                                   uint32_t max_shared_memory);
+
+template <typename Torus>
+void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
+    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
+    uint32_t lwe_idx, uint32_t max_shared_memory);
+
+template <typename Torus>
+void cuda_programmable_bootstrap_lwe_ciphertext_vector(
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
+    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
+    uint32_t lwe_idx, uint32_t max_shared_memory);
+
+#if (CUDA_ARCH >= 900)
+template <typename Torus>
+void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
+    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
+    uint32_t lwe_idx, uint32_t max_shared_memory);
+
+template <typename Torus, typename STorus>
+void scratch_cuda_programmable_bootstrap_tbc(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+#endif
+
+template <typename Torus, typename STorus>
+void scratch_cuda_programmable_bootstrap_cg(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+
+template <typename Torus, typename STorus>
+void scratch_cuda_programmable_bootstrap(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+
+template <typename Torus>
+bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
+                                                    uint32_t glwe_dimension,
+                                                    uint32_t polynomial_size,
+                                                    uint32_t level_count,
+                                                    uint32_t max_shared_memory);
+
+#ifdef __CUDACC__
+__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
+                                         int glwe_dimension,
+                                         uint32_t level_count);
+
+template <typename T>
+__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
+                                     uint32_t polynomial_size,
+                                     int glwe_dimension, uint32_t level_count);
+
+template <typename T>
+__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
+                                     uint32_t polynomial_size,
+                                     int glwe_dimension, uint32_t level_count);
+
+template <typename T>
+__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
+    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
+    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
+
+#endif
+
+#endif // CUDA_BOOTSTRAP_H
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -0,0 +1,339 @@
+#ifndef CUDA_MULTI_BIT_H
+#define CUDA_MULTI_BIT_H
+
+#include "programmable_bootstrap.h"
+#include <cstdint>
+
+extern "C" {
+
+bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t num_samples, uint32_t max_shared_memory);
+
+void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size, uint32_t grouping_factor);
+
+void scratch_cuda_multi_bit_programmable_bootstrap_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory, uint32_t chunk_size = 0);
+
+void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
+
+void scratch_cuda_generic_multi_bit_programmable_bootstrap_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+
+void cuda_generic_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lut_vector, void *lut_vector_indexes,
+    void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
+    int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
+    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
+
+void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
+                                                   uint32_t gpu_index,
+                                                   int8_t **pbs_buffer);
+}
+
+template <typename Torus>
+__host__ bool
+supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
+    uint32_t polynomial_size, uint32_t max_shared_memory);
+
+template <typename Torus>
+bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
+    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t max_shared_memory);
+
+#if CUDA_ARCH >= 900
+template <typename Torus, typename STorus>
+void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size);
+
+template <typename Torus>
+void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
+    uint32_t lwe_chunk_size);
+#endif
+
+template <typename Torus, typename STorus>
+void scratch_cuda_cg_multi_bit_programmable_bootstrap(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+
+template <typename Torus, typename STorus>
+void scratch_cuda_cg_multi_bit_programmable_bootstrap(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+
+template <typename Torus>
+void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
+    uint32_t lwe_chunk_size = 0);
+
+template <typename Torus, typename STorus>
+void scratch_cuda_multi_bit_programmable_bootstrap(
+    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+
+template <typename Torus>
+void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
+    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
+    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
+    uint32_t lwe_chunk_size = 0);
+
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
+    uint32_t polynomial_size);
+template <typename Torus>
+__host__ __device__ uint64_t
+get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
+    uint32_t polynomial_size);
+
+// Device buffers of the multi-bit PBS; every pointer is NULL until allocated.
+template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
+  int8_t *d_mem_keybundle = NULL;
+  int8_t *d_mem_acc_step_one = NULL;
+  int8_t *d_mem_acc_step_two = NULL;
+  int8_t *d_mem_acc_cg = NULL;
+  int8_t *d_mem_acc_tbc = NULL;
+  // NULL when allocate_gpu_memory is false, so that release() can skip them.
+  double2 *keybundle_fft = NULL;
+  Torus *global_accumulator = NULL;
+  double2 *global_accumulator_fft = NULL;
+
+  PBS_VARIANT pbs_variant;
+
+  // Computes the size of every buffer and, when allocate_gpu_memory is true,
+  // allocates the ones needed by pbs_variant on stream / gpu_index.
+  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
+             uint32_t polynomial_size, uint32_t level_count,
+             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
+             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
+    this->pbs_variant = pbs_variant;
+    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+    // default
+    uint64_t full_sm_keybundle =
+        get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<
+            Torus>(polynomial_size);
+    uint64_t full_sm_accumulate_step_one =
+        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
+            polynomial_size);
+    uint64_t full_sm_accumulate_step_two =
+        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
+            polynomial_size);
+    uint64_t partial_sm_accumulate_step_one =
+        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
+            Torus>(polynomial_size);
+    // cg
+    uint64_t full_sm_cg_accumulate =
+        get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+    uint64_t partial_sm_cg_accumulate =
+        get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+    // Block counts are kept in 64 bits so the byte sizes below cannot wrap.
+    uint64_t num_blocks_acc_step_two =
+        (uint64_t)input_lwe_ciphertext_count * (glwe_dimension + 1);
+    uint64_t num_blocks_acc_step_one = level_count * num_blocks_acc_step_two;
+    uint64_t num_blocks_acc_cg = num_blocks_acc_step_one;
+    uint64_t num_blocks_keybundle =
+        num_blocks_acc_step_one * lwe_chunk_size * (glwe_dimension + 1);
+
+#if CUDA_ARCH >= 900
+    uint64_t full_sm_tbc_accumulate =
+        get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+    uint64_t partial_sm_tbc_accumulate =
+        get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+    uint64_t minimum_sm_tbc =
+        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
+            polynomial_size);
+    auto num_blocks_acc_tbc = num_blocks_acc_cg;
+#endif
+
+    if (allocate_gpu_memory) {
+      // Keybundle
+      if (max_shared_memory < full_sm_keybundle)
+        d_mem_keybundle = (int8_t *)cuda_malloc_async(
+            num_blocks_keybundle * full_sm_keybundle, stream, gpu_index);
+
+      switch (pbs_variant) {
+      case PBS_VARIANT::CG:
+        // Accumulator CG
+        if (max_shared_memory < partial_sm_cg_accumulate)
+          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index);
+        else if (max_shared_memory < full_sm_cg_accumulate)
+          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index);
+        break;
+      case PBS_VARIANT::DEFAULT:
+        // Accumulator step one
+        if (max_shared_memory < partial_sm_accumulate_step_one)
+          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
+              gpu_index);
+        else if (max_shared_memory < full_sm_accumulate_step_one)
+          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
+              gpu_index);
+
+        // Accumulator step two
+        if (max_shared_memory < full_sm_accumulate_step_two)
+          d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
+              gpu_index);
+        break;
+#if CUDA_ARCH >= 900
+      case TBC:
+        // Accumulator TBC.
+        //
+        // The TBC PBS always needs minimum_sm_tbc bytes of shared memory, even
+        // in NOSM mode. Those bytes are known to be available here: otherwise
+        // the previous check would have redirected computation to some other
+        // variant. On top of that minimum, with fewer than
+        // partial_sm_tbc_accumulate extra bytes the TBC PBS runs in NOSM; with
+        // at least that but fewer than full_sm_tbc_accumulate, in PARTIALSM;
+        // otherwise in FULLSM.
+        if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
+          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_tbc * full_sm_tbc_accumulate, stream, gpu_index);
+        else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
+          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
+              num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream,
+              gpu_index);
+        break;
+#endif
+      default:
+        PANIC("Cuda error (PBS): unsupported implementation variant.")
+      }
+
+      keybundle_fft = (double2 *)cuda_malloc_async(
+          num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
+          stream, gpu_index);
+      global_accumulator = (Torus *)cuda_malloc_async(
+          num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
+          gpu_index);
+      global_accumulator_fft = (double2 *)cuda_malloc_async(
+          num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
+          stream, gpu_index);
+    }
+  }
+
+  // Frees every buffer that was allocated by the constructor.
+  void release(cudaStream_t stream, uint32_t gpu_index) {
+    if (d_mem_keybundle)
+      cuda_drop_async(d_mem_keybundle, stream, gpu_index);
+    switch (pbs_variant) {
+    case DEFAULT:
+      if (d_mem_acc_step_one)
+        cuda_drop_async(d_mem_acc_step_one, stream, gpu_index);
+      if (d_mem_acc_step_two)
+        cuda_drop_async(d_mem_acc_step_two, stream, gpu_index);
+      break;
+    case CG:
+      if (d_mem_acc_cg)
+        cuda_drop_async(d_mem_acc_cg, stream, gpu_index);
+      break;
+#if CUDA_ARCH >= 900
+    case TBC:
+      if (d_mem_acc_tbc)
+        cuda_drop_async(d_mem_acc_tbc, stream, gpu_index);
+      break;
+#endif
+    default:
+      PANIC("Cuda error (PBS): unsupported implementation variant.")
+    }
+
+    if (keybundle_fft)
+      cuda_drop_async(keybundle_fft, stream, gpu_index);
+    if (global_accumulator)
+      cuda_drop_async(global_accumulator, stream, gpu_index);
+    if (global_accumulator_fft)
+      cuda_drop_async(global_accumulator_fft, stream, gpu_index);
+  }
+};
+
+template <typename Torus, class params>
+__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
+                                     uint32_t polynomial_size,
+                                     uint32_t max_shared_memory);
+
+#endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -10,13 +10,10 @@ set(SOURCES
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
+    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper.h)
 file(GLOB_RECURSE SOURCES "*.cu")
 add_library(tfhe_cuda_backend STATIC ${SOURCES})
-set_target_properties(
-  tfhe_cuda_backend
-  PROPERTIES CUDA_SEPARABLE_COMPILATION ON
-             CUDA_RESOLVE_DEVICE_SYMBOLS ON
-             CUDA_ARCHITECTURES native)
+set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
 target_include_directories(tfhe_cuda_backend PRIVATE .)
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1 +1,21 @@
 #include "ciphertext.cuh"
+
+// void* wrapper around cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>.
+void cuda_convert_lwe_ciphertext_vector_to_gpu_64(
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t number_of_cts, uint32_t lwe_dimension) {
+  auto typed_stream = static_cast<cudaStream_t>(stream);
+  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
+      typed_stream, gpu_index, static_cast<uint64_t *>(dest),
+      static_cast<uint64_t *>(src), number_of_cts, lwe_dimension);
+}
+
+// void* wrapper around cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>.
+void cuda_convert_lwe_ciphertext_vector_to_cpu_64(
+    void *stream, uint32_t gpu_index, void *dest, void *src,
+    uint32_t number_of_cts, uint32_t lwe_dimension) {
+  auto typed_stream = static_cast<cudaStream_t>(stream);
+  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
+      typed_stream, gpu_index, static_cast<uint64_t *>(dest),
+      static_cast<uint64_t *>(src), number_of_cts, lwe_dimension);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -6,39 +6,29 @@
 #include <cstdint>

+/// Copies number_of_cts ciphertexts of (lwe_dimension + 1) elements each from
+/// src to dest with cuda_memcpy_async_to_gpu on the given stream.
 template <typename T>
-void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
-                                               cuda_stream_t *stream,
-                                               uint32_t number_of_cts,
+void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
+                                               uint32_t gpu_index, T *dest,
+                                               T *src, uint32_t number_of_cts,
                                                uint32_t lwe_dimension) {
-  cudaSetDevice(stream->gpu_index);
-  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
-  cuda_memcpy_async_to_gpu(dest, src, size, stream);
-}
-
-void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
-                                                  cuda_stream_t *stream,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension) {
-  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
-      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
+  cudaSetDevice(gpu_index);
+  // Widen before multiplying: the product of two uint32_t would wrap at 2^32.
+  uint64_t size = (uint64_t)number_of_cts * (lwe_dimension + 1) * sizeof(T);
+  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
 }

+/// Copies number_of_cts ciphertexts of (lwe_dimension + 1) elements each from
+/// src to dest with cuda_memcpy_async_to_cpu on the given stream.
 template <typename T>
-void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
-                                               cuda_stream_t *stream,
-                                               uint32_t number_of_cts,
+void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
+                                               uint32_t gpu_index, T *dest,
+                                               T *src, uint32_t number_of_cts,
                                                uint32_t lwe_dimension) {
-  cudaSetDevice(stream->gpu_index);
-  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
-  cuda_memcpy_async_to_cpu(dest, src, size, stream);
-}
-
-void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
-                                                  cuda_stream_t *stream,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension) {
-  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
-      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
+  cudaSetDevice(gpu_index);
+  // Widen before multiplying: the product of two uint32_t would wrap at 2^32.
+  uint64_t size = (uint64_t)number_of_cts * (lwe_dimension + 1) * sizeof(T);
+  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
@@ -20,9 +20,7 @@ private:
  uint32_t level_count;
  uint32_t base_log;
  uint32_t mask;
-  uint32_t halfbg;
  uint32_t num_poly;
-  T offset;
  int current_level;
  T mask_mod_b;
  T *state;
@@ -82,72 +80,12 @@ public:
    synchronize_threads_in_block();
  }

-  // Decomposes a single polynomial
-  __device__ void
-  decompose_and_compress_next_polynomial_elements(double2 *result, int j) {
-    if (j == 0)
-      current_level -= 1;
-
-    int tid = threadIdx.x;
-    auto state_slice = state + j * params::degree;
-    for (int i = 0; i < params::opt / 2; i++) {
-      T res_re = state_slice[tid] & mask_mod_b;
-      T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
-      state_slice[tid] >>= base_log;
-      state_slice[tid + params::degree / 2] >>= base_log;
-      T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
-      T carry_im =
-          ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
-      carry_re >>= (base_log - 1);
-      carry_im >>= (base_log - 1);
-      state_slice[tid] += carry_re;
-      state_slice[tid + params::degree / 2] += carry_im;
-      res_re -= carry_re << base_log;
-      res_im -= carry_im << base_log;
-
-      result[i].x = (int32_t)res_re;
-      result[i].y = (int32_t)res_im;
-
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-  }
-
  __device__ void decompose_and_compress_level(double2 *result, int level) {
    for (int i = 0; i < level_count - level; i++)
      decompose_and_compress_next(result);
  }
 };

-template <typename T> class GadgetMatrixSingle {
-private:
-  uint32_t level_count;
-  uint32_t base_log;
-  uint32_t mask;
-  uint32_t halfbg;
-  T offset;
-
-public:
-  __device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
-      : base_log(base_log), level_count(level_count) {
-    uint32_t bg = 1 << base_log;
-    this->halfbg = bg / 2;
-    this->mask = bg - 1;
-    T temp = 0;
-    for (int i = 0; i < this->level_count; i++) {
-      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
-    }
-    this->offset = temp * this->halfbg;
-  }
-
-  __device__ T decompose_one_level_single(T element, uint32_t level) {
-    T s = element + this->offset;
-    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
-    T temp1 = (s >> decal) & this->mask;
-    return (T)(temp1 - this->halfbg);
-  }
-};
-
 template <typename Torus>
 __device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
  Torus res = state & mask_mod_b;
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
@@ -49,11 +49,15 @@ __global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
 * global memory
 */
 template <typename T, typename ST, class params>
-void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
+void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
+                           uint32_t gpu_count, double2 *dest, T *src,
                           int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
                           uint32_t polynomial_size, uint32_t level_count,
-                           uint32_t gpu_index, uint32_t max_shared_memory) {
-  cudaSetDevice(stream->gpu_index);
+                           uint32_t max_shared_memory) {
+  if (gpu_count != 1)
+    PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
+          "supported yet.")
+  cudaSetDevice(gpu_indexes[0]);

  int shared_memory_size = sizeof(double) * polynomial_size;

@@ -62,11 +66,11 @@ void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,

  if (max_shared_memory < shared_memory_size) {
    device_batch_fft_ggsw_vector<T, ST, params, NOSM>
-        <<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
+        <<<gridSize, blockSize, 0, streams[0]>>>(dest, src, d_mem);
  } else {
    device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
-        <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
-                                                                      d_mem);
+        <<<gridSize, blockSize, shared_memory_size, streams[0]>>>(dest, src,
+                                                                  d_mem);
  }
  check_cuda_error(cudaGetLastError());
 }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -6,12 +6,13 @@
 * Head out to the equivalent operation on 64 bits for more details.
 */
 void cuda_keyswitch_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, static_cast<uint32_t *>(lwe_array_out),
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint32_t *>(lwe_array_out),
      static_cast<uint32_t *>(lwe_output_indexes),
      static_cast<uint32_t *>(lwe_array_in),
      static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
@@ -35,12 +36,13 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
 * 	- num_samples blocks of threads are launched
 */
 void cuda_keyswitch_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
+    void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, static_cast<uint64_t *>(lwe_array_out),
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_output_indexes),
      static_cast<uint64_t *>(lwe_array_in),
      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -35,109 +35,60 @@ template <typename Torus>
 __global__ void
 keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
          Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in,
-          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-          int lwe_lower, int lwe_upper, int cutoff) {
+          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) {
  int tid = threadIdx.x;
-
  extern __shared__ int8_t sharedmem[];
+  if (tid <= lwe_dimension_out) {
+    Torus *local_lwe_array_out = (Torus *)sharedmem;
+    auto block_lwe_array_in = get_chunk(
+        lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
+    auto block_lwe_array_out = get_chunk(
+        lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);
+    local_lwe_array_out[tid] = 0;

-  Torus *local_lwe_array_out = (Torus *)sharedmem;
+    if (tid == lwe_dimension_out) {
+      local_lwe_array_out[lwe_dimension_out] =
+          block_lwe_array_in[lwe_dimension_in];
+    }

-  auto block_lwe_array_in = get_chunk(
-      lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
-  auto block_lwe_array_out = get_chunk(
-      lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);
-
-  auto gadget = GadgetMatrixSingle<Torus>(base_log, level_count);
-
-  int lwe_part_per_thd;
-  if (tid < cutoff) {
-    lwe_part_per_thd = lwe_upper;
-  } else {
-    lwe_part_per_thd = lwe_lower;
-  }
-  __syncthreads();
-
-  for (int k = 0; k < lwe_part_per_thd; k++) {
-    int idx = tid + k * blockDim.x;
-    local_lwe_array_out[idx] = 0;
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    local_lwe_array_out[lwe_dimension_out] =
-        block_lwe_array_in[lwe_dimension_in];
-  }
-
-  for (int i = 0; i < lwe_dimension_in; i++) {
-
-    __syncthreads();
-
-    Torus a_i =
-        round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);
-
-    Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
-    Torus mask_mod_b = (1ll << base_log) - 1ll;
-
-    for (int j = 0; j < level_count; j++) {
-      auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
-      Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
-      for (int k = 0; k < lwe_part_per_thd; k++) {
-        int idx = tid + k * blockDim.x;
-        local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
+    for (int i = 0; i < lwe_dimension_in; i++) {
+      Torus a_i = round_to_closest_multiple(block_lwe_array_in[i], base_log,
+                                            level_count);
+      Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
+      Torus mask_mod_b = (1ll << base_log) - 1ll;
+      for (int j = 0; j < level_count; j++) {
+        auto ksk_block =
+            get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
+        Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
+        local_lwe_array_out[tid] -= (Torus)ksk_block[tid] * decomposed;
      }
    }
-  }
-
-  for (int k = 0; k < lwe_part_per_thd; k++) {
-    int idx = tid + k * blockDim.x;
-    block_lwe_array_out[idx] = local_lwe_array_out[idx];
+    block_lwe_array_out[tid] = local_lwe_array_out[tid];
  }
 }

 /// assume lwe_array_in in the gpu
 template <typename Torus>
 __host__ void cuda_keyswitch_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
+    Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

-  cudaSetDevice(stream->gpu_index);
-  constexpr int ideal_threads = 128;
-
-  int lwe_dim = lwe_dimension_out + 1;
-  int lwe_lower, lwe_upper, cutoff;
-  if (lwe_dim % ideal_threads == 0) {
-    lwe_lower = lwe_dim / ideal_threads;
-    lwe_upper = lwe_dim / ideal_threads;
-    cutoff = 0;
-  } else {
-    int y =
-        ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
-    cutoff = ideal_threads - y;
-    lwe_lower = lwe_dim / ideal_threads;
-    lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
-  }
-
-  int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
-
-  int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
-
-  cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
-  check_cuda_error(cudaGetLastError());
+  check_cuda_error(cudaSetDevice(gpu_index));
+  constexpr int ideal_threads = 1024;
+  if (lwe_dimension_out + 1 > ideal_threads) // one thread per coefficient
+    PANIC("Cuda error (keyswitch): lwe dimension size out should be smaller "
+          "or equal to the number of threads per block")

+  int lwe_size = lwe_dimension_out + 1;
+  int shared_mem = sizeof(Torus) * lwe_size;
   dim3 grid(num_samples, 1, 1);
   dim3 threads(ideal_threads, 1, 1);

-  //    cudaFuncSetAttribute(keyswitch<Torus>,
-  //                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-  //                         shared_mem);
-
-  keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
+  keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
       lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
-      lwe_upper, cutoff);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count);
   check_cuda_error(cudaGetLastError());
 }

--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -3,46 +3,53 @@
 #include <cuda_runtime.h>

 /// Unsafe function to create a CUDA stream, must check first that GPU exists
-cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
-  cudaSetDevice(gpu_index);
-  cuda_stream_t *stream = new cuda_stream_t(gpu_index);
+cudaStream_t cuda_create_stream(uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  cudaStream_t stream;
+  check_cuda_error(cudaStreamCreate(&stream));
  return stream;
 }

 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
-int cuda_destroy_stream(cuda_stream_t *stream) {
-  stream->release();
-  return 0;
+void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaStreamDestroy(stream));
+}
+
+void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaStreamSynchronize(stream));
 }

 /// Unsafe function that will try to allocate even if gpu_index is invalid
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
-  cudaSetDevice(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  void *ptr;
-  cudaMalloc((void **)&ptr, size);
-  check_cuda_error(cudaGetLastError());
+  check_cuda_error(cudaMalloc((void **)&ptr, size));

  return ptr;
 }

 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
-void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
-  cudaSetDevice(stream->gpu_index);
+void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
+                        uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
  void *ptr;

 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)
  int support_async_alloc;
-  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
-                                          cudaDevAttrMemoryPoolsSupported,
-                                          stream->gpu_index));
+  check_cuda_error(cudaDeviceGetAttribute(
+      &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

  if (support_async_alloc) {
-    check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream->stream));
+    cuda_synchronize_stream(stream, gpu_index);
+    check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream));
+    cuda_synchronize_stream(stream, gpu_index);
  } else {
    check_cuda_error(cudaMalloc((void **)&ptr, size));
  }
@@ -52,184 +59,102 @@ void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
  return ptr;
 }

-/// Checks that allocation is valid
-/// 0: valid
-/// -1: invalid, not enough memory in device
-/// -2: invalid, gpu index doesn't exist
-int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
-
-  if (gpu_index >= cuda_get_number_of_gpus()) {
-    // error code: invalid gpu_index
-    return -2;
-  }
-  cudaSetDevice(gpu_index);
+/// Panics if `size` bytes exceed the free memory reported for `gpu_index`
+void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
  size_t total_mem, free_mem;
-  cudaMemGetInfo(&free_mem, &total_mem);
+  check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
  if (size > free_mem) {
-    // error code: not enough memory
-    return -1;
+    PANIC("Cuda error: not enough memory on device. "
+          "Available: %zu vs Requested: %llu",
+          free_mem, (unsigned long long)size)
  }
-  return 0;
 }

 /// Returns
-///  -> 0 if Cooperative Groups is not supported.
-///  -> 1 otherwise
-int cuda_check_support_cooperative_groups() {
+///  false if Cooperative Groups is not supported.
+///  true otherwise
+bool cuda_check_support_cooperative_groups() {
  int cooperative_groups_supported = 0;
-  cudaDeviceGetAttribute(&cooperative_groups_supported,
-                         cudaDevAttrCooperativeLaunch, 0);
+  check_cuda_error(cudaDeviceGetAttribute(&cooperative_groups_supported,
+                                          cudaDevAttrCooperativeLaunch, 0));

  return cooperative_groups_supported > 0;
 }

-/// Tries to copy memory to the GPU asynchronously
-/// 0: success
-/// -1: error, invalid device pointer
-/// -2: error, gpu index doesn't exist
-/// -3: error, zero copy size
-int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                             cuda_stream_t *stream) {
-  if (size == 0) {
-    // error code: zero copy size
-    return -3;
-  }
-
-  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
-    // error code: invalid gpu_index
-    return -2;
-  }
-  cudaPointerAttributes attr;
-  cudaPointerGetAttributes(&attr, dest);
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
-    // error code: invalid device pointer
-    return -1;
-  }
-
-  cudaSetDevice(stream->gpu_index);
+/// Returns
+///  false if Thread Block Cluster is not supported.
+///  true otherwise
+bool cuda_check_support_thread_block_clusters() {
+#if CUDA_ARCH >= 900
+  // To-do: Is this really the best way to check support?
+  int tbc_supported = 0;
  check_cuda_error(
-      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
-  return 0;
+      cudaDeviceGetAttribute(&tbc_supported, cudaDevAttrClusterLaunch, 0));
+
+  return tbc_supported > 0;
+#else
+  return false;
+#endif
 }

-/// Tries to copy memory to the GPU synchronously
-/// 0: success
-/// -1: error, invalid device pointer
-/// -2: error, gpu index doesn't exist
-/// -3: error, zero copy size
-int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) {
-  if (size == 0) {
-    // error code: zero copy size
-    return -3;
-  }
-
+/// Copy memory to the GPU asynchronously
+void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
+                              cudaStream_t stream, uint32_t gpu_index) {
+  if (size == 0)
+    return;
  cudaPointerAttributes attr;
-  cudaPointerGetAttributes(&attr, dest);
-  if (attr.type != cudaMemoryTypeDevice) {
-    // error code: invalid device pointer
-    return -1;
+  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
+  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
  }

-  check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
-  return 0;
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(
+      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }

-/// Tries to copy memory to the CPU synchronously
-/// 0: success
-/// -1: error, invalid device pointer
-/// -2: error, gpu index doesn't exist
-/// -3: error, zero copy size
-int cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {
-  if (size == 0) {
-    // error code: zero copy size
-    return -3;
-  }
-
-  cudaPointerAttributes attr;
-  cudaPointerGetAttributes(&attr, src);
-  if (attr.type != cudaMemoryTypeDevice) {
-    // error code: invalid device pointer
-    return -1;
-  }
-
-  check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
-  return 0;
-}
-
-/// Tries to copy memory within a GPU asynchronously
-/// 0: success
-/// -1: error, invalid device pointer
-/// -2: error, gpu index doesn't exist
-/// -3: error, zero copy size
-int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                                 cuda_stream_t *stream) {
-  if (size == 0) {
-    // error code: zero copy size
-    return -3;
-  }
-
-  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
-    // error code: invalid gpu_index
-    return -2;
-  }
+/// Copy memory within a GPU asynchronously
+void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
+                                  cudaStream_t stream, uint32_t gpu_index) {
+  if (size == 0)
+    return;
  cudaPointerAttributes attr_dest;
-  cudaPointerGetAttributes(&attr_dest, dest);
-  if (attr_dest.device != stream->gpu_index &&
-      attr_dest.type != cudaMemoryTypeDevice) {
-    // error code: invalid device pointer
-    return -1;
+  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
+  if (attr_dest.device != gpu_index && attr_dest.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
  }
  cudaPointerAttributes attr_src;
-  cudaPointerGetAttributes(&attr_src, src);
-  if (attr_src.device != stream->gpu_index &&
-      attr_src.type != cudaMemoryTypeDevice) {
-    // error code: invalid device pointer
-    return -1;
+  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
+  if (attr_src.device != gpu_index && attr_src.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
  if (attr_src.device != attr_dest.device) {
-    // error code: different devices
-    return -1;
+    PANIC("Cuda error: different devices specified in copy from GPU to GPU.")
  }

-  cudaSetDevice(stream->gpu_index);
-  check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
-                                   stream->stream));
-  return 0;
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(
+      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
 }

 /// Synchronizes device
-/// 0: success
-/// -2: error, gpu index doesn't exist
-int cuda_synchronize_device(uint32_t gpu_index) {
-  if (gpu_index >= cuda_get_number_of_gpus()) {
-    // error code: invalid gpu_index
-    return -2;
-  }
-  cudaSetDevice(gpu_index);
-  cudaDeviceSynchronize();
-  return 0;
+void cuda_synchronize_device(uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaDeviceSynchronize());
 }

-int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
-                      cuda_stream_t *stream) {
-  if (size == 0) {
-    // error code: zero copy size
-    return -3;
-  }
-
-  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
-    // error code: invalid gpu_index
-    return -2;
-  }
+void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
+                       cudaStream_t stream, uint32_t gpu_index) {
+  if (size == 0)
+    return;
  cudaPointerAttributes attr;
-  cudaPointerGetAttributes(&attr, dest);
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
-    // error code: invalid device pointer
-    return -1;
+  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
+  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
  }
-  cudaSetDevice(stream->gpu_index);
-  check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
-  return 0;
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }

 template <typename Torus>
@@ -240,111 +165,99 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
 }

 template <typename Torus>
-void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
-                          Torus n) {
+void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
+                          Torus *d_array, Torus value, Torus n) {
+  cudaPointerAttributes attr;
+  check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
+  if (attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid dest device pointer in cuda set value.")
+  }
+  check_cuda_error(cudaSetDevice(gpu_index));
  int block_size = 256;
  int num_blocks = (n + block_size - 1) / block_size;

  // Launch the kernel
-  cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
-                                                                n);
+  cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
+                                                               n);
+  check_cuda_error(cudaGetLastError());
 }

 /// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
-template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
-                                   uint64_t value, uint64_t n);
-template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
-                                   uint32_t value, uint32_t n);
+template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
+                                   uint64_t *d_array, uint64_t value,
+                                   uint64_t n);
+template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
+                                   uint32_t *d_array, uint32_t value,
+                                   uint32_t n);

-/// Tries to copy memory to the GPU asynchronously
-/// 0: success
-/// -1: error, invalid device pointer
-/// -2: error, gpu index doesn't exist
-/// -3: error, zero copy size
-int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                             cuda_stream_t *stream) {
-  if (size == 0) {
-    // error code: zero copy size
-    return -3;
-  }
-
-  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
-    // error code: invalid gpu_index
-    return -2;
-  }
+/// Copy memory to the CPU asynchronously
+void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                              cudaStream_t stream, uint32_t gpu_index) {
+  if (size == 0)
+    return;
  cudaPointerAttributes attr;
-  cudaPointerGetAttributes(&attr, src);
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
-    // error code: invalid device pointer
-    return -1;
+  check_cuda_error(cudaPointerGetAttributes(&attr, src));
+  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
+    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
  }

-  cudaSetDevice(stream->gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(
-      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
-  return 0;
+      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }

 /// Return number of GPUs available
 int cuda_get_number_of_gpus() {
  int num_gpus;
-  cudaGetDeviceCount(&num_gpus);
+  check_cuda_error(cudaGetDeviceCount(&num_gpus));
  return num_gpus;
 }

 /// Drop a cuda array
-int cuda_drop(void *ptr, uint32_t gpu_index) {
-  if (gpu_index >= cuda_get_number_of_gpus()) {
-    // error code: invalid gpu_index
-    return -2;
-  }
-  cudaSetDevice(gpu_index);
+void cuda_drop(void *ptr, uint32_t gpu_index) {
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaFree(ptr));
-  return 0;
 }

-/// Drop a cuda array. Tries to do it asynchronously
-int cuda_drop_async(void *ptr, cuda_stream_t *stream) {
+/// Drop a cuda array asynchronously, if supported on the device
+void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {

-  cudaSetDevice(stream->gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)
  int support_async_alloc;
-  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
-                                          cudaDevAttrMemoryPoolsSupported,
-                                          stream->gpu_index));
+  check_cuda_error(cudaDeviceGetAttribute(
+      &support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));

  if (support_async_alloc) {
-    check_cuda_error(cudaFreeAsync(ptr, stream->stream));
+    check_cuda_error(cudaFreeAsync(ptr, stream));
  } else {
    check_cuda_error(cudaFree(ptr));
  }
 #else
  check_cuda_error(cudaFree(ptr));
 #endif
-  return 0;
 }

 /// Get the maximum size for the shared memory
 int cuda_get_max_shared_memory(uint32_t gpu_index) {
-  if (gpu_index >= cuda_get_number_of_gpus()) {
-    // error code: invalid gpu_index
-    return -2;
-  }
-  cudaSetDevice(gpu_index);
-  cudaDeviceProp prop;
-  cudaGetDeviceProperties(&prop, gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  int max_shared_memory = 0;
-  if (prop.major >= 6) {
-    max_shared_memory = prop.sharedMemPerMultiprocessor;
-  } else {
-    max_shared_memory = prop.sharedMemPerBlock;
-  }
+  // Check the status returned by the query itself, not cudaGetLastError()
+  check_cuda_error(cudaDeviceGetAttribute(
+      &max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, gpu_index));
  return max_shared_memory;
 }

-int cuda_synchronize_stream(cuda_stream_t *stream) {
-  stream->synchronize();
-  return 0;
+void cuda_stream_add_callback(cudaStream_t stream, uint32_t gpu_index,
+                              cudaStreamCallback_t callback, void *user_data) {
+
+  check_cuda_error(cudaSetDevice(gpu_index));
+  check_cuda_error(cudaStreamAddCallback(stream, callback, user_data, 0));
+}
+
+void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
+                                  void *host_pointer) {
+  free(host_pointer);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
@@ -181,7 +181,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
  // from level 8, we need to check size of params degree, because we support
  // minimum actual polynomial size = 256,  when compressed size is halfed and
  // minimum supported compressed size is 128, so we always need first 7
-  // levels of butterfy operation, since butterfly levels are hardcoded
+  // levels of butterfly operation, since butterfly levels are hardcoded
  // we need to check if polynomial size is big enough to require specific level
  // of butterfly.
  if constexpr (params::degree >= 256) {
@@ -353,7 +353,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {

  // compressed size = 8192 is actual polynomial size = 16384.
  // twiddles for this size can't fit in constant memory so
-  // butterfly operation for this level acess device memory to fetch
+  // butterfly operation for this level access device memory to fetch
  // twiddles
  if constexpr (params::degree >= 8192) {
    // level 13
@@ -484,7 +484,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  // below level 8, we don't need to check size of params degree, because we
  // support minimum actual polynomial size = 256,  when compressed size is
  // halfed and minimum supported compressed size is 128, so we always need
-  // last 7 levels of butterfy operation, since butterfly levels are hardcoded
+  // last 7 levels of butterfly operation, since butterfly levels are hardcoded
  // we don't need to check if polynomial size is big enough to require
  // specific level of butterfly.
  // level 7
--- a/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
@@ -3,7 +3,7 @@

 /*
 * 'negtwiddles' are stored in constant memory for faster access times
- * because of it's limitied size, only twiddles for up to 2^12 polynomial size
+ * because of its limited size, only twiddles for up to 2^12 polynomial size
 * can be stored there, twiddles for 2^13 are stored in device memory
 * 'negtwiddles13'
 */
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -1,7 +1,7 @@
 #include "integer/bitwise_ops.cuh"

 void scratch_cuda_integer_radix_bitop_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
+    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -15,17 +15,19 @@ void scratch_cuda_integer_radix_bitop_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_bitop_kb<uint64_t>(
-      stream, (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count,
-      params, op_type, allocate_gpu_memory);
+      static_cast<cudaStream_t>(stream), gpu_index,
+      (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count, params,
+      op_type, allocate_gpu_memory);
 }

 void cuda_bitop_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
-    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t lwe_ciphertext_count) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
+    void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitop_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_1),
      static_cast<uint64_t *>(lwe_array_2),
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
@@ -33,19 +35,22 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
 }

 void cuda_bitnot_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
-    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_in, int8_t *mem_ptr, void *bsk,
+    void *ksk, uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitnot_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_array_in),
      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      lwe_ciphertext_count);
 }

-void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {
+void cleanup_cuda_integer_bitop(void *stream, uint32_t gpu_index,
+                                int8_t **mem_ptr_void) {

  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
+  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -5,15 +5,16 @@
 #include "device.h"
 #include "integer.cuh"
 #include "integer.h"
-#include "pbs/bootstrap_low_latency.cuh"
-#include "pbs/bootstrap_multibit.cuh"
+#include "pbs/programmable_bootstrap_classic.cuh"
+#include "pbs/programmable_bootstrap_multibit.cuh"
 #include "polynomial/functions.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <omp.h>

 template <typename Torus>
 __host__ void
-host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
+host_integer_radix_bitop_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                            uint32_t gpu_count, Torus *lwe_array_out,
                            Torus *lwe_array_1, Torus *lwe_array_2,
                            int_bitop_buffer<Torus> *mem_ptr, void *bsk,
                            Torus *ksk, uint32_t num_radix_blocks) {
@@ -21,31 +22,32 @@ host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
  auto lut = mem_ptr->lut;

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
-      num_radix_blocks, lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
+      bsk, ksk, num_radix_blocks, lut, lut->params.message_modulus);
 }

 template <typename Torus>
-__host__ void
-host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                             Torus *lwe_array_in,
-                             int_bitop_buffer<Torus> *mem_ptr, void *bsk,
-                             Torus *ksk, uint32_t num_radix_blocks) {
+__host__ void host_integer_radix_bitnot_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in, int_bitop_buffer<Torus> *mem_ptr,
+    void *bsk, Torus *ksk, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks, lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsk, ksk,
+      num_radix_blocks, lut);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_bitop_kb(
-    cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
+    cudaStream_t stream, uint32_t gpu_index, int_bitop_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
    bool allocate_gpu_memory) {

-  *mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
-                                         allocate_gpu_memory);
+  check_cuda_error(cudaSetDevice(gpu_index));
+  *mem_ptr = new int_bitop_buffer<Torus>(stream, gpu_index, op, params,
+                                         num_radix_blocks, allocate_gpu_memory);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -1,7 +1,7 @@
 #include "integer/cmux.cuh"

 void scratch_cuda_integer_radix_cmux_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
+    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -17,17 +17,20 @@ void scratch_cuda_integer_radix_cmux_kb_64(
      [](uint64_t x) -> uint64_t { return x == 1; };

  scratch_cuda_integer_radix_cmux_kb(
-      stream, (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
+      static_cast<cudaStream_t>(stream), gpu_index,
+      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
 }

 void cuda_cmux_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
-    void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t lwe_ciphertext_count) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_condition, void *lwe_array_true,
+    void *lwe_array_false, int8_t *mem_ptr, void *bsk, void *ksk,
+    uint32_t lwe_ciphertext_count) {

  host_integer_radix_cmux_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t *>(lwe_condition),
      static_cast<uint64_t *>(lwe_array_true),
      static_cast<uint64_t *>(lwe_array_false),
@@ -36,10 +39,10 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
      lwe_ciphertext_count);
 }

-void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
+void cleanup_cuda_integer_radix_cmux(void *stream, uint32_t gpu_index,
                                     int8_t **mem_ptr_void) {

  int_cmux_buffer<uint64_t> *mem_ptr =
      (int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
+  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -5,11 +5,13 @@
 #include <omp.h>

 template <typename Torus>
-__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
+__host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
+                          uint32_t gpu_count, Torus *lwe_array_out,
                          Torus *lwe_array_input, Torus *lwe_condition,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
                          int_radix_lut<Torus> *predicate, void *bsk,
                          Torus *ksk, uint32_t num_radix_blocks) {
+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

  int big_lwe_size = params.big_lwe_dimension + 1;
@@ -26,32 +28,31 @@ __host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
    auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
    auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;

-    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
-                                   stream->stream>>>(
-        lwe_array_out_block, lwe_array_input_block, lwe_condition,
-        predicate->lwe_indexes, params.big_lwe_dimension,
+    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
+        lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
+        lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
        params.message_modulus, 1);
    check_cuda_error(cudaGetLastError());
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
-      predicate);
+      streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsk,
+      ksk, num_radix_blocks, predicate);
 }

 template <typename Torus>
-__host__ void
-host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                           Torus *lwe_condition, Torus *lwe_array_true,
-                           Torus *lwe_array_false,
-                           int_cmux_buffer<Torus> *mem_ptr, void *bsk,
-                           Torus *ksk, uint32_t num_radix_blocks) {
+__host__ void host_integer_radix_cmux_kb(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_condition, Torus *lwe_array_true,
+    Torus *lwe_array_false, int_cmux_buffer<Torus> *mem_ptr, void *bsk,
+    Torus *ksk, uint32_t num_radix_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

  // Since our CPU threads will be working on different streams we shall assert
  // the work in the main stream is completed
-  stream->synchronize();
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
  auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
  auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;

@@ -61,40 +62,43 @@ host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
 #pragma omp section
    {
      auto mem_true = mem_ptr->zero_if_true_buffer;
-      zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
-                  lwe_condition, mem_true, mem_ptr->inverted_predicate_lut, bsk,
-                  ksk, num_radix_blocks);
+      zero_out_if(&true_stream, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+                  lwe_array_true, lwe_condition, mem_true,
+                  mem_ptr->inverted_predicate_lut, bsk, ksk, num_radix_blocks);
    }
 #pragma omp section
    {
      auto mem_false = mem_ptr->zero_if_false_buffer;
-      zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
-                  lwe_condition, mem_false, mem_ptr->predicate_lut, bsk, ksk,
-                  num_radix_blocks);
+      zero_out_if(&false_stream, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
+                  lwe_array_false, lwe_condition, mem_false,
+                  mem_ptr->predicate_lut, bsk, ksk, num_radix_blocks);
    }
  }
-  cuda_synchronize_stream(true_stream);
-  cuda_synchronize_stream(false_stream);
+  cuda_synchronize_stream(true_stream, gpu_indexes[0]);
+  cuda_synchronize_stream(false_stream, gpu_indexes[0]);

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
  // have kept its value
  auto added_cts = mem_ptr->tmp_true_ct;
-  host_addition(stream, added_cts, mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
-                params.big_lwe_dimension, num_radix_blocks);
+  host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
+                mem_ptr->tmp_false_ct, params.big_lwe_dimension,
+                num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, added_cts, bsk, ksk, num_radix_blocks,
-      mem_ptr->message_extract_lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsk, ksk,
+      num_radix_blocks, mem_ptr->message_extract_lut);
 }

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_cmux_kb(
-    cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
+    cudaStream_t stream, uint32_t gpu_index, int_cmux_buffer<Torus> **mem_ptr,
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

-  *mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
-                                        num_radix_blocks, allocate_gpu_memory);
+  check_cuda_error(cudaSetDevice(gpu_index));
+  *mem_ptr =
+      new int_cmux_buffer<Torus>(stream, gpu_index, predicate_lut_f, params,
+                                 num_radix_blocks, allocate_gpu_memory);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -1,12 +1,12 @@
 #include "integer/comparison.cuh"

 void scratch_cuda_integer_radix_comparison_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
+    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -17,9 +17,10 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  switch (op_type) {
  case EQ:
  case NE:
-    scratch_cuda_integer_radix_equality_check_kb<uint64_t>(
-        stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
-        lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
+    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+        static_cast<cudaStream_t>(stream), gpu_index,
+        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
+        op_type, false, allocate_gpu_memory);
    break;
  case GT:
  case GE:
@@ -27,17 +28,18 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  case LE:
  case MAX:
  case MIN:
-    scratch_cuda_integer_radix_difference_check_kb<uint64_t>(
-        stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
-        lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
+    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+        static_cast<cudaStream_t>(stream), gpu_index,
+        (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params,
+        op_type, is_signed, allocate_gpu_memory);
    break;
  }
 }

 void cuda_comparison_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
-    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t lwe_ciphertext_count) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void *lwe_array_1, void *lwe_array_2, int8_t *mem_ptr,
+    void *bsk, void *ksk, uint32_t num_radix_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -45,39 +47,42 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  case EQ:
  case NE:
    host_integer_radix_equality_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
+        static_cast<uint64_t *>(ksk), num_radix_blocks);
    break;
  case GT:
  case GE:
  case LT:
  case LE:
    host_integer_radix_difference_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer,
        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
-        lwe_ciphertext_count);
+        num_radix_blocks);
    break;
  case MAX:
  case MIN:
    host_integer_radix_maxmin_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
+        static_cast<uint64_t *>(ksk), num_radix_blocks);
    break;
  default:
-    printf("Not implemented\n");
+    PANIC("Cuda error: integer operation not supported")
  }
 }

-void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
+void cleanup_cuda_integer_comparison(void *stream, uint32_t gpu_index,
                                     int8_t **mem_ptr_void) {

  int_comparison_buffer<uint64_t> *mem_ptr =
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
+  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -8,8 +8,8 @@
 #include "integer/cmux.cuh"
 #include "integer/negation.cuh"
 #include "integer/scalar_addition.cuh"
-#include "pbs/bootstrap_low_latency.cuh"
-#include "pbs/bootstrap_multibit.cuh"
+#include "pbs/programmable_bootstrap_classic.cuh"
+#include "pbs/programmable_bootstrap_multibit.cuh"
 #include "types/complex/operations.cuh"
 #include "utils/kernel_dimensions.cuh"

@@ -33,26 +33,37 @@ __global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
 }

 template <typename Torus>
-__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
-                                    Torus *input, uint32_t lwe_dimension,
+__host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
+                                    Torus *output, Torus *input,
+                                    uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

+  cudaSetDevice(gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  // Add all blocks and store in sum
-  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
+  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
      output, input, lwe_dimension, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
 }

+/* This takes an array of lwe ciphertexts, where each is an encryption of
+ * either 0 or 1.
+ *
+ * It writes in lwe_array_out a single lwe ciphertext encrypting 1 if all input
+ * blocks are 1 otherwise the block encrypts 0
+ *
+ */
 template <typename Torus>
 __host__ void
-are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
+are_all_comparisons_block_true(cudaStream_t *streams, uint32_t *gpu_indexes,
+                               uint32_t gpu_count, Torus *lwe_array_out,
                               Torus *lwe_array_in,
                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                               Torus *ksk, uint32_t num_radix_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -62,63 +73,148 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,

  auto are_all_block_true_buffer =
      mem_ptr->eq_buffer->are_all_block_true_buffer;
+  auto tmp_out = are_all_block_true_buffer->tmp_out;

  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

-  cuda_memcpy_async_gpu_to_gpu(
-      lwe_array_out, lwe_array_in,
-      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
+  cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
+                               num_radix_blocks * (big_lwe_dimension + 1) *
+                                   sizeof(Torus),
+                               streams[0], gpu_indexes[0]);

-  int lut_num_blocks = 0;
  uint32_t remaining_blocks = num_radix_blocks;
-  while (remaining_blocks > 1) {
+
+  while (remaining_blocks > 0) {
    // Split in max_value chunks
    uint32_t chunk_length = std::min(max_value, remaining_blocks);
    int num_chunks = remaining_blocks / chunk_length;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
-    auto input_blocks = lwe_array_out;
+    auto input_blocks = tmp_out;
    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(stream, accumulator, input_blocks,
-                            big_lwe_dimension, chunk_length);
+      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
+                            input_blocks, big_lwe_dimension, chunk_length);

      accumulator += (big_lwe_dimension + 1);
      remaining_blocks -= (chunk_length - 1);
      input_blocks += (big_lwe_dimension + 1) * chunk_length;
    }
    accumulator = are_all_block_true_buffer->tmp_block_accumulated;
+    auto is_equal_to_num_blocks_map =
+        &are_all_block_true_buffer->is_equal_to_lut_map;

    // Selects a LUT
    int_radix_lut<Torus> *lut;
    if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
      // is_non_zero_lut_buffer LUT
      lut = mem_ptr->eq_buffer->is_non_zero_lut;
-    } else if (chunk_length == max_value) {
-      // is_max_value LUT
-      lut = are_all_block_true_buffer->is_max_value_lut;
    } else {
-      // is_equal_to_num_blocks LUT
-      lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
-      if (chunk_length != lut_num_blocks) {
+      if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
+          (*is_equal_to_num_blocks_map).end()) {
+        // The LUT is already computed
+        lut = (*is_equal_to_num_blocks_map)[chunk_length];
+      } else {
+        // LUT needs to be computed
+        auto new_lut =
+            new int_radix_lut<Torus>(streams[0], gpu_indexes[0], params,
+                                     max_value, num_radix_blocks, true);
+
        auto is_equal_to_num_blocks_lut_f = [max_value,
                                             chunk_length](Torus x) -> Torus {
          return (x & max_value) == chunk_length;
        };
        generate_device_accumulator<Torus>(
-            stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
-            carry_modulus, is_equal_to_num_blocks_lut_f);
+            streams[0], gpu_indexes[0], new_lut->lut, glwe_dimension,
+            polynomial_size, message_modulus, carry_modulus,
+            is_equal_to_num_blocks_lut_f);

-        // We don't have to generate this lut again
-        lut_num_blocks = chunk_length;
+        (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
+        lut = new_lut;
      }
    }

    // Applies the LUT
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
+    if (remaining_blocks == 1) {
+      // In the last iteration we copy the output to the final address
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk,
+          1, lut);
+      return;
+    } else {
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsk, ksk,
+          num_chunks, lut);
+    }
+  }
+}
+
+/* This takes an array of lwe ciphertexts, where each is an encryption of
+ * either 0 or 1.
+ *
+ * It writes in lwe_array_out a single lwe ciphertext encrypting 1 if at least
+ * one input ciphertext encrypts 1 otherwise encrypts 0
+ */
+template <typename Torus>
+__host__ void is_at_least_one_comparisons_block_true(
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    uint32_t num_radix_blocks) {
+
+  cudaSetDevice(gpu_indexes[0]);
+  auto params = mem_ptr->params;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+  auto message_modulus = params.message_modulus;
+  auto carry_modulus = params.carry_modulus;
+
+  auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;
+
+  uint32_t total_modulus = message_modulus * carry_modulus;
+  uint32_t max_value = total_modulus - 1;
+
+  cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
+                               num_radix_blocks * (big_lwe_dimension + 1) *
+                                   sizeof(Torus),
+                               streams[0], gpu_indexes[0]);
+
+  uint32_t remaining_blocks = num_radix_blocks;
+  while (remaining_blocks > 0) {
+    // Split in max_value chunks
+    uint32_t chunk_length = std::min(max_value, remaining_blocks);
+    int num_chunks = remaining_blocks / chunk_length;
+
+    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
+    // as in the worst case we will be adding `max_value` ones
+    auto input_blocks = mem_ptr->tmp_lwe_array_out;
+    auto accumulator = buffer->tmp_block_accumulated;
+    for (int i = 0; i < num_chunks; i++) {
+      accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
+                            input_blocks, big_lwe_dimension, chunk_length);
+
+      accumulator += (big_lwe_dimension + 1);
+      remaining_blocks -= (chunk_length - 1);
+      input_blocks += (big_lwe_dimension + 1) * chunk_length;
+    }
+    accumulator = buffer->tmp_block_accumulated;
+
+    // Selects a LUT
+    int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;
+
+    // Applies the LUT
+    if (remaining_blocks == 1) {
+      // In the last iteration we copy the output to the final address
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsk, ksk,
+          1, lut);
+      return;
+    } else {
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
+          accumulator, bsk, ksk, num_chunks, lut);
+    }
  }
 }

@@ -143,10 +239,12 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
 // are_all_comparisons_block_true
 template <typename Torus>
 __host__ void host_compare_with_zero_equality(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
-    int32_t num_radix_blocks) {
+    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {

+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -171,89 +269,71 @@ __host__ void host_compare_with_zero_equality(

  if (num_radix_blocks == 1) {
    // Just copy
-    cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
+    cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes,
+                                 streams[0], gpu_indexes[0]);
    num_sum_blocks = 1;
  } else {
    uint32_t remainder_blocks = num_radix_blocks;
-
    auto sum_i = sum;
    auto chunk = lwe_array_in;
    while (remainder_blocks > 1) {
      uint32_t chunk_size =
          std::min(remainder_blocks, num_elements_to_fill_carry);

-      accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
-                            chunk_size);
+      accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
+                            big_lwe_dimension, chunk_size);

      num_sum_blocks++;
      remainder_blocks -= (chunk_size - 1);

      // Update operands
-      chunk += chunk_size * big_lwe_size;
+      chunk += (chunk_size - 1) * big_lwe_size;
      sum_i += big_lwe_size;
    }
  }

-  auto is_equal_to_zero_lut = mem_ptr->diff_buffer->is_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, sum, sum, bsk, ksk, num_sum_blocks, is_equal_to_zero_lut);
-  are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
-                                 num_sum_blocks);
-
-  // The result will be in the two first block. Everything else is
-  //  garbage.
-  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                    big_lwe_size_bytes * (num_radix_blocks - 1), stream);
+      streams, gpu_indexes, gpu_count, sum, sum, bsk, ksk, num_sum_blocks,
+      zero_comparison);
+  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                 sum, mem_ptr, bsk, ksk, num_sum_blocks);
 }

 template <typename Torus>
 __host__ void host_integer_radix_equality_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
-    Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks) {
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_1, Torus *lwe_array_2,
+    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
+    uint32_t num_radix_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto eq_buffer = mem_ptr->eq_buffer;

-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-
  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
  integer_radix_apply_bivariate_lookup_table_kb(
-      stream, comparisons, lwe_array_1, lwe_array_2, bsk, ksk, num_radix_blocks,
-      eq_buffer->operator_lut);
+      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
+      bsk, ksk, num_radix_blocks, eq_buffer->operator_lut,
+      eq_buffer->operator_lut->params.message_modulus);

  // This takes a Vec of blocks, where each block is either 0 or 1.
  //
-  // It return a block encrypting 1 if all input blocks are 1
+  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
-  are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
-                                 bsk, ksk, num_radix_blocks);
-
-  // Zero all blocks but the first
-  size_t big_lwe_size = big_lwe_dimension + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                    big_lwe_size_bytes * (num_radix_blocks - 1), stream);
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_integer_radix_equality_check_kb(
-    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
-    bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_comparison_buffer<Torus>(
-      stream, op, params, num_radix_blocks, allocate_gpu_memory);
+  are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
+                                 comparisons, mem_ptr, bsk, ksk,
+                                 num_radix_blocks);
 }

 template <typename Torus>
 __host__ void
-compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
+compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, Torus *lwe_array_out,
                        Torus *lwe_array_left, Torus *lwe_array_right,
                        int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                        Torus *ksk, uint32_t num_radix_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -274,21 +354,21 @@ compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,

  // Subtract
  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
-                   big_lwe_dimension, num_radix_blocks);
+  host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
+                   lwe_array_right, big_lwe_dimension, num_radix_blocks);

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb(
-      stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
-      is_non_zero_lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsk, ksk,
+      num_radix_blocks, is_non_zero_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
-                                            big_lwe_dimension, num_radix_blocks,
-                                            message_modulus, carry_modulus);
+  host_integer_radix_add_scalar_one_inplace(
+      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
+      num_radix_blocks, message_modulus, carry_modulus);
 }

 // Reduces a vec containing shortint blocks that encrypts a sign
@@ -296,12 +376,14 @@ compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
 // final sign
 template <typename Torus>
 __host__ void
-tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
+tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
+                    uint32_t gpu_count, Torus *lwe_array_out,
                    Torus *lwe_block_comparisons,
                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
                    std::function<Torus(Torus)> sign_handler_f, void *bsk,
                    Torus *ksk, uint32_t num_radix_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto params = tree_buffer->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -320,16 +402,19 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
  auto y = tree_buffer->tmp_y;
  if (x != lwe_block_comparisons)
    cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
-                                 big_lwe_size_bytes * num_radix_blocks, stream);
+                                 big_lwe_size_bytes * num_radix_blocks,
+                                 streams[0], gpu_indexes[0]);

  uint32_t partial_block_count = num_radix_blocks;

  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
-    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
+    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                partial_block_count, 4);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, x, y, bsk, ksk, partial_block_count >> 1, inner_tree_leaf);
+        streams, gpu_indexes, gpu_count, x, y, bsk, ksk,
+        partial_block_count >> 1, inner_tree_leaf);

    if ((partial_block_count % 2) != 0) {
      partial_block_count >>= 1;
@@ -339,7 +424,8 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
      auto last_x_block = x + (partial_block_count - 1) * big_lwe_size;

      cuda_memcpy_async_gpu_to_gpu(last_x_block, last_y_block,
-                                   big_lwe_size_bytes, stream);
+                                   big_lwe_size_bytes, streams[0],
+                                   gpu_indexes[0]);
    } else {
      partial_block_count >>= 1;
    }
@@ -350,7 +436,8 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
  std::function<Torus(Torus)> f;

  if (partial_block_count == 2) {
-    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
+    pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
+                partial_block_count, 4);

    f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
      int msb = (x >> 2) & 3;
@@ -364,52 +451,60 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
    y = x;
    f = sign_handler_f;
  }
-  generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
-                                     polynomial_size, message_modulus,
-                                     carry_modulus, f);
+  generate_device_accumulator<Torus>(streams[0], gpu_indexes[0], last_lut->lut,
+                                     glwe_dimension, polynomial_size,
+                                     message_modulus, carry_modulus, f);

  // Last leaf
-  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
-                                                 ksk, 1, last_lut);
+  integer_radix_apply_univariate_lookup_table_kb(
+      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsk, ksk, 1, last_lut);
 }

 template <typename Torus>
 __host__ void host_integer_radix_difference_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
-    Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
+    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
+    Torus *lwe_array_out, Torus *lwe_array_left, Torus *lwe_array_right,
+    int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
-    uint32_t total_num_radix_blocks) {
+    uint32_t num_radix_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
+  auto big_lwe_size = big_lwe_dimension + 1;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

-  uint32_t num_radix_blocks = total_num_radix_blocks;
+  uint32_t packed_num_radix_blocks = num_radix_blocks;
  auto lhs = lwe_array_left;
  auto rhs = lwe_array_right;
-  if (carry_modulus == message_modulus) {
+  if (carry_modulus >= message_modulus) {
    // Packing is possible
    // Pack inputs
    Torus *packed_left = diff_buffer->tmp_packed_left;
    Torus *packed_right = diff_buffer->tmp_packed_right;
-    pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
-                num_radix_blocks, message_modulus);
-    pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
-                num_radix_blocks, message_modulus);
+    // In case the ciphertext is signed, the sign block and the one before it
+    // are handled separately
+    if (mem_ptr->is_signed) {
+      packed_num_radix_blocks -= 2;
+    }
+    pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
+                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
+    pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
+                big_lwe_dimension, packed_num_radix_blocks, message_modulus);
    // From this point we have half number of blocks
-    num_radix_blocks /= 2;
+    packed_num_radix_blocks /= 2;

    // Clean noise
-    auto cleaning_lut = mem_ptr->cleaning_lut;
+    auto identity_lut = mem_ptr->identity_lut;
    integer_radix_apply_univariate_lookup_table_kb(
-        stream, packed_left, packed_left, bsk, ksk, num_radix_blocks,
-        cleaning_lut);
+        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsk, ksk,
+        packed_num_radix_blocks, identity_lut);
    integer_radix_apply_univariate_lookup_table_kb(
-        stream, packed_right, packed_right, bsk, ksk, num_radix_blocks,
-        cleaning_lut);
+        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsk, ksk,
+        packed_num_radix_blocks, identity_lut);

    lhs = packed_left;
    rhs = packed_right;
@@ -420,49 +515,105 @@ __host__ void host_integer_radix_difference_check_kb(
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto comparisons = mem_ptr->tmp_block_comparisons;
-  compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
-                          num_radix_blocks);
+  auto num_comparisons = 0;
+  if (!mem_ptr->is_signed) {
+    // Compare packed blocks, or simply the total number of radix blocks in the
+    // inputs
+    compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
+                            rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks);
+    num_comparisons = packed_num_radix_blocks;
+  } else {
+    // Packing is possible
+    if (carry_modulus >= message_modulus) {
+      // Compare (num_radix_blocks - 2) / 2 packed blocks
+      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
+                              rhs, mem_ptr, bsk, ksk, packed_num_radix_blocks);
+
+      // Compare the last block before the sign block separately
+      auto identity_lut = mem_ptr->identity_lut;
+      Torus *last_left_block_before_sign_block =
+          diff_buffer->tmp_packed_left + packed_num_radix_blocks * big_lwe_size;
+      Torus *last_right_block_before_sign_block =
+          diff_buffer->tmp_packed_right +
+          packed_num_radix_blocks * big_lwe_size;
+      integer_radix_apply_univariate_lookup_table_kb(
+          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
+          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
+          identity_lut);
+      integer_radix_apply_univariate_lookup_table_kb(
+          streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
+          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
+          identity_lut);
+      compare_radix_blocks_kb(
+          streams, gpu_indexes, gpu_count,
+          comparisons + packed_num_radix_blocks * big_lwe_size,
+          last_left_block_before_sign_block, last_right_block_before_sign_block,
+          mem_ptr, bsk, ksk, 1);
+      // Compare the sign block separately
+      integer_radix_apply_bivariate_lookup_table_kb(
+          streams, gpu_indexes, gpu_count,
+          comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
+          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
+          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
+          mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
+      num_comparisons = packed_num_radix_blocks + 2;
+
+    } else {
+      compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
+                              lwe_array_left, lwe_array_right, mem_ptr, bsk,
+                              ksk, num_radix_blocks - 1);
+      // Compare the sign block separately
+      integer_radix_apply_bivariate_lookup_table_kb(
+          streams, gpu_indexes, gpu_count,
+          comparisons + (num_radix_blocks - 1) * big_lwe_size,
+          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
+          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
+          mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
+      num_comparisons = num_radix_blocks;
+    }
+  }

  // Reduces a vec containing radix blocks that encrypts a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
-  tree_sign_reduction(stream, lwe_array_out, comparisons,
-                      mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
-                      ksk, num_radix_blocks);
-
-  // The result will be in the first block. Everything else is garbage.
-  size_t big_lwe_size = big_lwe_dimension + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                    (total_num_radix_blocks - 1) * big_lwe_size_bytes, stream);
+  tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
+                      comparisons, mem_ptr->diff_buffer->tree_buffer,
+                      reduction_lut_f, bsk, ksk, num_comparisons);
 }

 template <typename Torus>
-__host__ void scratch_cuda_integer_radix_difference_check_kb(
-    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
+__host__ void scratch_cuda_integer_radix_comparison_check_kb(
+    cudaStream_t stream, uint32_t gpu_index,
+    int_comparison_buffer<Torus> **mem_ptr, uint32_t num_radix_blocks,
+    int_radix_params params, COMPARISON_TYPE op, bool is_signed,
    bool allocate_gpu_memory) {

-  *mem_ptr = new int_comparison_buffer<Torus>(
-      stream, op, params, num_radix_blocks, allocate_gpu_memory);
+  cudaSetDevice(gpu_index);
+  *mem_ptr = new int_comparison_buffer<Torus>(stream, gpu_index, op, params,
+                                              num_radix_blocks, is_signed,
+                                              allocate_gpu_memory);
 }

 template <typename Torus>
 __host__ void
-host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
+host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                             uint32_t gpu_count, Torus *lwe_array_out,
                             Torus *lwe_array_left, Torus *lwe_array_right,
                             int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                             Torus *ksk, uint32_t total_num_radix_blocks) {

+  cudaSetDevice(gpu_indexes[0]);
  // Compute the sign
  host_integer_radix_difference_check_kb(
-      stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
-      mem_ptr, mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks);
+      streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
+      lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsk,
+      ksk, total_num_radix_blocks);

  // Selector
-  host_integer_radix_cmux_kb(
-      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
+  host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
+                             mem_ptr->tmp_lwe_array_out, lwe_array_left,
+                             lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
+                             total_num_radix_blocks);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -0,0 +1,85 @@
+#include "integer/div_rem.cuh"
+
+// Type-erased scratch entry point for 64-bit radix div_rem: packs the scalar
+// arguments into an int_radix_params and forwards to the templated
+// scratch_cuda_integer_div_rem_kb<uint64_t>, which fills `*mem_ptr`.
+void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
+    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t big_lwe_dimension,
+    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+
+  // Recover the typed views of the opaque arguments first.
+  auto typed_stream = static_cast<cudaStream_t>(stream);
+  auto typed_mem_ptr = (int_div_rem_memory<uint64_t> **)mem_ptr;
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);
+
+  scratch_cuda_integer_div_rem_kb<uint64_t>(typed_stream, gpu_index,
+                                            typed_mem_ptr, num_blocks, params,
+                                            allocate_gpu_memory);
+}
+
+// Type-erased entry point for 64-bit radix division with remainder.
+// Recovers the typed pointers once, then dispatches to the
+// host_integer_div_rem_kb instantiation that matches the polynomial size
+// stored in the scratch object's parameters. Panics on any other size.
+void cuda_integer_div_rem_radix_ciphertext_kb_64(
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *quotient,
+    void *remainder, void *numerator, void *divisor, int8_t *mem_ptr, void *bsk,
+    void *ksk, uint32_t num_blocks) {
+
+  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
+
+  // These conversions are the same for every supported polynomial size.
+  auto typed_streams = (cudaStream_t *)(streams);
+  auto quotient_64 = static_cast<uint64_t *>(quotient);
+  auto remainder_64 = static_cast<uint64_t *>(remainder);
+  auto numerator_64 = static_cast<uint64_t *>(numerator);
+  auto divisor_64 = static_cast<uint64_t *>(divisor);
+  auto ksk_64 = static_cast<uint64_t *>(ksk);
+
+  switch (mem->params.polynomial_size) {
+  case 512:
+    host_integer_div_rem_kb<uint64_t, Degree<512>>(
+        typed_streams, gpu_indexes, gpu_count, quotient_64, remainder_64,
+        numerator_64, divisor_64, bsk, ksk_64, mem, num_blocks);
+    break;
+  case 1024:
+    host_integer_div_rem_kb<uint64_t, Degree<1024>>(
+        typed_streams, gpu_indexes, gpu_count, quotient_64, remainder_64,
+        numerator_64, divisor_64, bsk, ksk_64, mem, num_blocks);
+    break;
+  case 2048:
+    host_integer_div_rem_kb<uint64_t, Degree<2048>>(
+        typed_streams, gpu_indexes, gpu_count, quotient_64, remainder_64,
+        numerator_64, divisor_64, bsk, ksk_64, mem, num_blocks);
+    break;
+  case 4096:
+    host_integer_div_rem_kb<uint64_t, Degree<4096>>(
+        typed_streams, gpu_indexes, gpu_count, quotient_64, remainder_64,
+        numerator_64, divisor_64, bsk, ksk_64, mem, num_blocks);
+    break;
+  case 8192:
+    host_integer_div_rem_kb<uint64_t, Degree<8192>>(
+        typed_streams, gpu_indexes, gpu_count, quotient_64, remainder_64,
+        numerator_64, divisor_64, bsk, ksk_64, mem, num_blocks);
+    break;
+  case 16384:
+    host_integer_div_rem_kb<uint64_t, Degree<16384>>(
+        typed_streams, gpu_indexes, gpu_count, quotient_64, remainder_64,
+        numerator_64, divisor_64, bsk, ksk_64, mem, num_blocks);
+    break;
+  default:
+    PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
+          "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
+  }
+}
+
+// Reinterprets `*mem_ptr_void` as the div_rem scratch object and calls its
+// release() with the given stream and GPU index.
+// NOTE(review): the object itself is not deleted here and `*mem_ptr_void` is
+// left unchanged - confirm that release() or the caller takes care of it.
+void cleanup_cuda_integer_div_rem(void *stream, uint32_t gpu_index,
+                                  int8_t **mem_ptr_void) {
+  auto *div_rem_mem =
+      reinterpret_cast<int_div_rem_memory<uint64_t> *>(*mem_ptr_void);
+  auto typed_stream = static_cast<cudaStream_t>(stream);
+
+  div_rem_mem->release(typed_stream, gpu_index);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -0,0 +1,624 @@
+#ifndef TFHE_RS_DIV_REM_CUH
+#define TFHE_RS_DIV_REM_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer.h"
+#include "integer/comparison.cuh"
+#include "integer/integer.cuh"
+#include "integer/negation.cuh"
+#include "integer/scalar_shifts.cuh"
+#include "linear_algebra.h"
+#include "programmable_bootstrap.h"
+#include "utils/helper.cuh"
+#include "utils/kernel_dimensions.cuh"
+#include <fstream>
+#include <iostream>
+#include <omp.h>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Integer division of `a` by `b` rounded up (meant for a >= 0 and b > 0).
+// `inline` because this non-template function is defined in a header: without
+// it, every translation unit including the header emits its own definition.
+inline int ceil_div(int a, int b) { return (a + b - 1) / b; }
+
+// Non-owning view over a buffer of big-LWE ciphertext blocks.
+// The struct makes it easier to handle a list of ciphertexts and to move data
+// between lists. It never allocates or frees memory; it only keeps track of
+// how many ciphertexts (`len`) are currently considered part of the list.
+template <typename Torus> struct lwe_ciphertext_list {
+  // start of the underlying buffer (not owned)
+  Torus *data;
+  // capacity of the list, in blocks
+  size_t max_blocks;
+  // number of blocks currently in the list
+  size_t len;
+  int_radix_params params;
+
+  // number of Torus elements in one block (big_lwe_dimension + 1)
+  size_t big_lwe_size;
+  // number of Torus elements in `max_blocks` blocks
+  size_t radix_size;
+  // byte counterparts of the two sizes above
+  size_t big_lwe_size_bytes;
+  size_t radix_size_bytes;
+  size_t big_lwe_dimension;
+
+  // Wraps `src`; the list starts "full" (len == max_blocks).
+  // Members are initialized in declaration order.
+  lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
+      : data(src), max_blocks(max_blocks), len(max_blocks), params(params) {
+    big_lwe_dimension = params.big_lwe_dimension;
+    big_lwe_size = big_lwe_dimension + 1;
+    big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+    radix_size = max_blocks * big_lwe_size;
+    radix_size_bytes = radix_size * sizeof(Torus);
+  }
+
+  // copies blocks `start_block..=finish_block` of the Torus* `src` to the
+  // start of `data`; does not change the value of self len
+  void copy_from(Torus *src, size_t start_block, size_t finish_block,
+                 cudaStream_t stream, uint32_t gpu_index) {
+    size_t tmp_len = finish_block - start_block + 1;
+    cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
+                                 tmp_len * big_lwe_size_bytes, stream,
+                                 gpu_index);
+  }
+
+  // same as above, reading from another lwe_ciphertext_list;
+  // does not change the value of self len
+  void copy_from(const lwe_ciphertext_list &src, size_t start_block,
+                 size_t finish_block, cudaStream_t stream, uint32_t gpu_index) {
+    copy_from(src.data, start_block, finish_block, stream, gpu_index);
+  }
+
+  // copies blocks `start_block..=finish_block` of the Torus* `src` to the
+  // start of `data`, and sets self len to the number of copied blocks
+  void clone_from(Torus *src, size_t start_block, size_t finish_block,
+                  cudaStream_t stream, uint32_t gpu_index) {
+    len = finish_block - start_block + 1;
+
+    cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
+                                 len * big_lwe_size_bytes, stream, gpu_index);
+  }
+
+  // same as above, reading from another lwe_ciphertext_list;
+  // sets self len to the number of copied blocks
+  void clone_from(const lwe_ciphertext_list &src, size_t start_block,
+                  size_t finish_block, cudaStream_t stream,
+                  uint32_t gpu_index) {
+    clone_from(src.data, start_block, finish_block, stream, gpu_index);
+  }
+
+  // assign zero to blocks starting from `start_block` including `finish_block`
+  void assign_zero(size_t start_block, size_t finish_block, cudaStream_t stream,
+                   uint32_t gpu_index) {
+    auto size = finish_block - start_block + 1;
+    cuda_memset_async(&data[start_block * big_lwe_size], 0,
+                      size * big_lwe_size_bytes, stream, gpu_index);
+  }
+
+  // return pointer to last block; the list must not be empty, otherwise
+  // `len - 1` would wrap around
+  Torus *last_block() {
+    assert(len > 0);
+    return &data[(len - 1) * big_lwe_size];
+  }
+
+  // return pointer to first_block
+  Torus *first_block() { return data; }
+
+  // return block with `index`
+  Torus *get_block(size_t index) {
+    assert(index < len);
+    return &data[index * big_lwe_size];
+  }
+
+  bool is_empty() { return len == 0; }
+
+  // does not drop actual memory from `data`, only reduces value of `len` by
+  // one. Popping an empty list trips the assert when asserts are enabled and
+  // leaves `len` at 0 otherwise.
+  void pop() {
+    assert(len > 0);
+    if (len > 0)
+      len--;
+  }
+
+  // insert ciphertext at index `ind`, moving the blocks at `ind..len` one
+  // position towards the end
+  void insert(size_t ind, Torus *ciphertext_block, cudaStream_t stream,
+              uint32_t gpu_index) {
+    assert(ind <= len);
+    assert(len < max_blocks);
+
+    size_t insert_offset = ind * big_lwe_size;
+
+    // walk from the end so that no block is overwritten before it is moved
+    for (size_t i = len; i > ind; i--) {
+      Torus *src = &data[(i - 1) * big_lwe_size];
+      Torus *dst = &data[i * big_lwe_size];
+      cuda_memcpy_async_gpu_to_gpu(dst, src, big_lwe_size_bytes, stream,
+                                   gpu_index);
+    }
+
+    cuda_memcpy_async_gpu_to_gpu(&data[insert_offset], ciphertext_block,
+                                 big_lwe_size_bytes, stream, gpu_index);
+    len++;
+  }
+
+  // push ciphertext at the end of `data`
+  void push(Torus *ciphertext_block, cudaStream_t stream, uint32_t gpu_index) {
+    assert(len < max_blocks);
+
+    size_t offset = len * big_lwe_size;
+    cuda_memcpy_async_gpu_to_gpu(&data[offset], ciphertext_block,
+                                 big_lwe_size_bytes, stream, gpu_index);
+    len++;
+  }
+
+  // duplicate ciphertext into `number_of_blocks` ciphertexts and set self len
+  // to `number_of_blocks`
+  void fill_with_same_ciphertext(Torus *ciphertext, size_t number_of_blocks,
+                                 cudaStream_t stream, uint32_t gpu_index) {
+    assert(number_of_blocks <= max_blocks);
+
+    for (size_t i = 0; i < number_of_blocks; i++) {
+      Torus *dest = &data[i * big_lwe_size];
+      cuda_memcpy_async_gpu_to_gpu(dest, ciphertext, big_lwe_size_bytes, stream,
+                                   gpu_index);
+    }
+
+    len = number_of_blocks;
+  }
+
+  // used for debugging, prints body of each ciphertext.
+  void print_blocks_body(const char *name) {
+    for (size_t i = 0; i < len; i++) {
+      print_debug(name, &data[i * big_lwe_size + big_lwe_dimension], 1);
+    }
+  }
+};
+
+// Creates the scratch object used by host_integer_div_rem_kb and stores its
+// address in `*mem_ptr`.
+template <typename Torus>
+__host__ void
+scratch_cuda_integer_div_rem_kb(cudaStream_t stream, uint32_t gpu_index,
+                                int_div_rem_memory<Torus> **mem_ptr,
+                                uint32_t num_blocks, int_radix_params params,
+                                bool allocate_gpu_memory) {
+
+  auto *div_rem_mem = new int_div_rem_memory<Torus>(
+      stream, gpu_index, params, num_blocks, allocate_gpu_memory);
+  *mem_ptr = div_rem_mem;
+}
+
+// Bit-by-bit division with remainder on radix ciphertexts.
+//
+// Writes `num_blocks` blocks to `quotient` and to `remainder`, computed from
+// `numerator` and `divisor`. One loop iteration is run per message bit, from
+// the most significant one down to bit 0. Inside an iteration, independent
+// steps are issued on the sub-streams owned by `mem_ptr` from OpenMP sections;
+// everything is issued on gpu_indexes[0] (the sub-calls receive a GPU count
+// of 1). All temporary buffers come from `mem_ptr` and are wrapped in
+// lwe_ciphertext_list views.
+template <typename Torus, class params>
+__host__ void
+host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
+                        uint32_t gpu_count, Torus *quotient, Torus *remainder,
+                        Torus *numerator, Torus *divisor, void *bsk,
+                        uint64_t *ksk, int_div_rem_memory<uint64_t> *mem_ptr,
+                        uint32_t num_blocks) {
+
+  auto radix_params = mem_ptr->params;
+
+  auto big_lwe_dimension = radix_params.big_lwe_dimension;
+  auto big_lwe_size = big_lwe_dimension + 1;
+  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+
+  uint32_t message_modulus = radix_params.message_modulus;
+  uint32_t carry_modulus = radix_params.carry_modulus;
+  uint32_t num_bits_in_message = 31 - __builtin_clz(message_modulus);
+  uint32_t total_bits = num_bits_in_message * num_blocks;
+
+  // put temporary buffers in lwe_ciphertext_list for easy use
+  lwe_ciphertext_list<Torus> remainder1(mem_ptr->remainder1, radix_params,
+                                        num_blocks);
+  lwe_ciphertext_list<Torus> remainder2(mem_ptr->remainder2, radix_params,
+                                        num_blocks);
+  lwe_ciphertext_list<Torus> numerator_block_stack(
+      mem_ptr->numerator_block_stack, radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> numerator_block_1(mem_ptr->numerator_block_1,
+                                               radix_params, 1);
+  lwe_ciphertext_list<Torus> tmp_radix(mem_ptr->tmp_radix, radix_params,
+                                       num_blocks + 1);
+  lwe_ciphertext_list<Torus> interesting_remainder1(
+      mem_ptr->interesting_remainder1, radix_params, num_blocks + 1);
+  lwe_ciphertext_list<Torus> interesting_remainder2(
+      mem_ptr->interesting_remainder2, radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> interesting_divisor(mem_ptr->interesting_divisor,
+                                                 radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> divisor_ms_blocks(mem_ptr->divisor_ms_blocks,
+                                               radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> new_remainder(mem_ptr->new_remainder, radix_params,
+                                           num_blocks);
+  lwe_ciphertext_list<Torus> subtraction_overflowed(
+      mem_ptr->subtraction_overflowed, radix_params, 1);
+  lwe_ciphertext_list<Torus> did_not_overflow(mem_ptr->did_not_overflow,
+                                              radix_params, 1);
+  lwe_ciphertext_list<Torus> overflow_sum(mem_ptr->overflow_sum, radix_params,
+                                          1);
+  lwe_ciphertext_list<Torus> overflow_sum_radix(mem_ptr->overflow_sum_radix,
+                                                radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> tmp_1(mem_ptr->tmp_1, radix_params, num_blocks);
+  lwe_ciphertext_list<Torus> at_least_one_upper_block_is_non_zero(
+      mem_ptr->at_least_one_upper_block_is_non_zero, radix_params, 1);
+  lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
+      mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);
+
+  numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, streams[0],
+                                   gpu_indexes[0]);
+  remainder1.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
+  remainder2.assign_zero(0, num_blocks - 1, streams[0], gpu_indexes[0]);
+
+  cuda_memset_async(quotient, 0, big_lwe_size_bytes * num_blocks, streams[0],
+                    gpu_indexes[0]);
+
+  for (int i = total_bits - 1; i >= 0; i--) {
+    uint32_t block_of_bit = i / num_bits_in_message;
+    uint32_t pos_in_block = i % num_bits_in_message;
+    uint32_t msb_bit_set = total_bits - 1 - i;
+    uint32_t last_non_trivial_block = msb_bit_set / num_bits_in_message;
+
+    // Index to the first block of the remainder that is fully trivial 0
+    // and all blocks after it are also trivial zeros
+    // This number is in range 1..=num_blocks
+    uint32_t first_trivial_block = last_non_trivial_block + 1;
+
+    interesting_remainder1.clone_from(remainder1, 0, last_non_trivial_block,
+                                      streams[0], gpu_indexes[0]);
+    interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
+                                      streams[0], gpu_indexes[0]);
+    interesting_divisor.clone_from(divisor, 0, last_non_trivial_block,
+                                   streams[0], gpu_indexes[0]);
+    divisor_ms_blocks.clone_from(divisor,
+                                 (msb_bit_set + 1) / num_bits_in_message,
+                                 num_blocks - 1, streams[0], gpu_indexes[0]);
+
+    // We split the divisor at a block position, when in reality the split
+    // should be at a bit position meaning that potentially (depending on
+    // msb_bit_set) the split versions share some bits they should not. So we do
+    // one PBS on the last block of the interesting_divisor, and first block of
+    // divisor_ms_blocks to trim out bits which should not be there
+    auto trim_last_interesting_divisor_bits =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          if ((msb_bit_set + 1) % num_bits_in_message == 0) {
+            return;
+          }
+          // The last block of the interesting part of the divisor
+          // can contain bits which we should not account for
+          // we have to zero them out.
+
+          // Where the msb is set in the block
+          uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
+
+          // e.g 2 bits in message:
+          // if pos_in_block is 0, then we want to keep only first bit (right
+          // shift mask by 1); if pos_in_block is 1, then we want to keep the
+          // two bits (right shift mask by 0)
+          uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
+
+          // Create mask of 1s on the message part, 0s in the carries
+          uint32_t full_message_mask = message_modulus - 1;
+
+          // Shift the mask so that we will only keep bits we should
+          uint32_t shifted_mask = full_message_mask >> shift_amount;
+
+          integer_radix_apply_univariate_lookup_table_kb(
+              streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
+              interesting_divisor.last_block(), bsk, ksk, 1,
+              mem_ptr->masking_luts_1[shifted_mask]);
+        }; // trim_last_interesting_divisor_bits
+
+    auto trim_first_divisor_ms_bits =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          if (divisor_ms_blocks.is_empty() ||
+              ((msb_bit_set + 1) % num_bits_in_message) == 0) {
+            return;
+          }
+          // Where the msb is set in the block
+          uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
+
+          // e.g 2 bits in message:
+          // if pos_in_block is 0, then we want to discard the first bit (left
+          // shift mask by 1); if pos_in_block is 1, then we want to discard
+          // the two bits (left shift mask by 2)
+          uint32_t shift_amount = pos_in_block + 1;
+          uint32_t full_message_mask = message_modulus - 1;
+          uint32_t shifted_mask = full_message_mask << shift_amount;
+
+          // Keep the mask within the range of message bits, so that
+          // the estimated degree of the output is < msg_modulus
+          shifted_mask = shifted_mask & full_message_mask;
+
+          integer_radix_apply_univariate_lookup_table_kb(
+              streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
+              divisor_ms_blocks.first_block(), bsk, ksk, 1,
+              mem_ptr->masking_luts_2[shifted_mask]);
+        }; // trim_first_divisor_ms_bits
+
+    // This does
+    //  R := R << 1; R(0) := N(i)
+    //
+    // We could do that by left shifting R by one, then unchecked_add the
+    // correct numerator bit.
+    //
+    // However, to keep the remainder clean (noise wise), what we do is that we
+    // put the remainder block from which we need to extract the bit, as the LSB
+    // of the Remainder, so that left shifting will pull the bit we need.
+    auto left_shift_interesting_remainder1 =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          numerator_block_1.clone_from(
+              numerator_block_stack, numerator_block_stack.len - 1,
+              numerator_block_stack.len - 1, streams[0], gpu_indexes[0]);
+          numerator_block_stack.pop();
+          interesting_remainder1.insert(0, numerator_block_1.first_block(),
+                                        streams[0], gpu_indexes[0]);
+
+          host_integer_radix_logical_scalar_shift_kb_inplace(
+              streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
+              mem_ptr->shift_mem_1, bsk, ksk, interesting_remainder1.len);
+
+          tmp_radix.clone_from(interesting_remainder1, 0,
+                               interesting_remainder1.len - 1, streams[0],
+                               gpu_indexes[0]);
+
+          host_radix_blocks_rotate_right(
+              streams, gpu_indexes, gpu_count, interesting_remainder1.data,
+              tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
+
+          numerator_block_1.clone_from(
+              interesting_remainder1, interesting_remainder1.len - 1,
+              interesting_remainder1.len - 1, streams[0], gpu_indexes[0]);
+
+          interesting_remainder1.pop();
+
+          if (pos_in_block != 0) {
+            // We have not yet extracted all the bits from this numerator
+            // so, we put it back on the front so that it gets taken next
+            // iteration
+            numerator_block_stack.push(numerator_block_1.first_block(),
+                                       streams[0], gpu_indexes[0]);
+          }
+        }; // left_shift_interesting_remainder1
+
+    auto left_shift_interesting_remainder2 =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          host_integer_radix_logical_scalar_shift_kb_inplace(
+              streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
+              mem_ptr->shift_mem_2, bsk, ksk, interesting_remainder2.len);
+        }; // left_shift_interesting_remainder2
+
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+#pragma omp parallel sections
+    {
+#pragma omp section
+      {
+        // interesting_divisor
+        trim_last_interesting_divisor_bits(&mem_ptr->sub_stream_1,
+                                           &gpu_indexes[0], 1);
+      }
+#pragma omp section
+      {
+        // divisor_ms_blocks
+        trim_first_divisor_ms_bits(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1);
+      }
+#pragma omp section
+      {
+        // interesting_remainder1
+        // numerator_block_stack
+        left_shift_interesting_remainder1(&mem_ptr->sub_stream_3,
+                                          &gpu_indexes[0], 1);
+      }
+#pragma omp section
+      {
+        // interesting_remainder2
+        left_shift_interesting_remainder2(&mem_ptr->sub_stream_4,
+                                          &gpu_indexes[0], 1);
+      }
+    }
+    cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
+    cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
+    cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
+    cuda_synchronize_stream(mem_ptr->sub_stream_4, gpu_indexes[0]);
+
+    // if interesting_remainder1 != 0 -> interesting_remainder2 == 0
+    // if interesting_remainder1 == 0 -> interesting_remainder2 != 0
+    // In practice interesting_remainder1 contains the numerator bit,
+    // but in that position, interesting_remainder2 always has a 0
+    auto &merged_interesting_remainder = interesting_remainder1;
+
+    host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
+                  merged_interesting_remainder.data,
+                  interesting_remainder2.data, radix_params.big_lwe_dimension,
+                  merged_interesting_remainder.len);
+
+    // after create_clean_version_of_merged_remainder
+    // `merged_interesting_remainder` will be reused as
+    // `cleaned_merged_interesting_remainder`
+    cleaned_merged_interesting_remainder.clone_from(
+        merged_interesting_remainder, 0, merged_interesting_remainder.len - 1,
+        streams[0], gpu_indexes[0]);
+
+    assert(merged_interesting_remainder.len == interesting_divisor.len);
+
+    // `new_remainder` is not initialized yet, so need to set length
+    new_remainder.len = merged_interesting_remainder.len;
+
+    // fills:
+    //  `new_remainder` - radix ciphertext
+    //  `subtraction_overflowed` - single ciphertext
+    auto do_overflowing_sub = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
+                                  uint32_t gpu_count) {
+      host_integer_overflowing_sub_kb<Torus, params>(
+          streams, gpu_indexes, gpu_count, new_remainder.data,
+          subtraction_overflowed.data, merged_interesting_remainder.data,
+          interesting_divisor.data, bsk, ksk, mem_ptr->overflow_sub_mem,
+          merged_interesting_remainder.len);
+    };
+
+    // fills:
+    //  `at_least_one_upper_block_is_non_zero` - single ciphertext
+    auto check_divisor_upper_blocks = [&](cudaStream_t *streams,
+                                          uint32_t *gpu_indexes,
+                                          uint32_t gpu_count) {
+      auto &trivial_blocks = divisor_ms_blocks;
+      if (trivial_blocks.is_empty()) {
+        cuda_memset_async(at_least_one_upper_block_is_non_zero.first_block(), 0,
+                          big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+      } else {
+
+        // We could call unchecked_scalar_ne
+        // But we are in the special case where scalar == 0
+        // So we can skip some stuff
+        host_compare_with_zero_equality(
+            streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
+            mem_ptr->comparison_buffer, bsk, ksk, trivial_blocks.len,
+            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
+
+        tmp_1.len =
+            ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
+
+        is_at_least_one_comparisons_block_true(
+            streams, gpu_indexes, gpu_count,
+            at_least_one_upper_block_is_non_zero.data, tmp_1.data,
+            mem_ptr->comparison_buffer, bsk, ksk, tmp_1.len);
+      }
+    };
+
+    // Creates a cleaned version (noise wise) of the merged remainder
+    // so that it can be safely used in bivariate PBSes
+    // fills:
+    //  `cleaned_merged_interesting_remainder` - radix ciphertext
+    auto create_clean_version_of_merged_remainder =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          integer_radix_apply_univariate_lookup_table_kb(
+              streams, gpu_indexes, gpu_count,
+              cleaned_merged_interesting_remainder.data,
+              cleaned_merged_interesting_remainder.data, bsk, ksk,
+              cleaned_merged_interesting_remainder.len,
+              mem_ptr->message_extract_lut_1);
+        };
+
+    // phase 2
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+#pragma omp parallel sections
+    {
+#pragma omp section
+      {
+        // new_remainder
+        // subtraction_overflowed
+        do_overflowing_sub(&mem_ptr->sub_stream_1, &gpu_indexes[0], 1);
+      }
+#pragma omp section
+      {
+        // at_least_one_upper_block_is_non_zero
+        check_divisor_upper_blocks(&mem_ptr->sub_stream_2, &gpu_indexes[0], 1);
+      }
+#pragma omp section
+      {
+        // cleaned_merged_interesting_remainder
+        create_clean_version_of_merged_remainder(&mem_ptr->sub_stream_3,
+                                                 &gpu_indexes[0], 1);
+      }
+    }
+    cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
+    cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
+    cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
+
+    host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
+                  subtraction_overflowed.data,
+                  at_least_one_upper_block_is_non_zero.data,
+                  radix_params.big_lwe_dimension, 1);
+
+    int factor = (i) ? 3 : 2;
+    int factor_lut_id = factor - 2;
+    overflow_sum_radix.fill_with_same_ciphertext(
+        overflow_sum.first_block(), cleaned_merged_interesting_remainder.len,
+        streams[0], gpu_indexes[0]);
+
+    auto conditionally_zero_out_merged_interesting_remainder =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+              streams, gpu_indexes, gpu_count,
+              cleaned_merged_interesting_remainder.data,
+              cleaned_merged_interesting_remainder.data,
+              overflow_sum_radix.data, bsk, ksk,
+              cleaned_merged_interesting_remainder.len,
+              mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
+              factor);
+        };
+
+    auto conditionally_zero_out_merged_new_remainder =
+        [&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
+          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+              streams, gpu_indexes, gpu_count, new_remainder.data,
+              new_remainder.data, overflow_sum_radix.data, bsk, ksk,
+              new_remainder.len,
+              mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
+        };
+
+    auto set_quotient_bit = [&](cudaStream_t *streams, uint32_t *gpu_indexes,
+                                uint32_t gpu_count) {
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+          streams, gpu_indexes, gpu_count, did_not_overflow.data,
+          subtraction_overflowed.data,
+          at_least_one_upper_block_is_non_zero.data, bsk, ksk, 1,
+          mem_ptr->merge_overflow_flags_luts[pos_in_block],
+          mem_ptr->merge_overflow_flags_luts[pos_in_block]
+              ->params.message_modulus);
+
+      host_addition(streams[0], gpu_indexes[0],
+                    &quotient[block_of_bit * big_lwe_size],
+                    &quotient[block_of_bit * big_lwe_size],
+                    did_not_overflow.data, radix_params.big_lwe_dimension, 1);
+    };
+
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+#pragma omp parallel sections
+    {
+#pragma omp section
+      {
+        // cleaned_merged_interesting_remainder
+        conditionally_zero_out_merged_interesting_remainder(
+            &mem_ptr->sub_stream_1, &gpu_indexes[0], 1);
+      }
+#pragma omp section
+      {
+        // new_remainder
+        conditionally_zero_out_merged_new_remainder(&mem_ptr->sub_stream_2,
+                                                    &gpu_indexes[0], 1);
+      }
+#pragma omp section
+      {
+        // quotient
+        set_quotient_bit(&mem_ptr->sub_stream_3, &gpu_indexes[0], 1);
+      }
+    }
+    cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
+    cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
+    cuda_synchronize_stream(mem_ptr->sub_stream_3, gpu_indexes[0]);
+
+    // Both lists hold `last_non_trivial_block + 1` blocks at this point, i.e.
+    // exactly `first_trivial_block`; the inclusive range copied below
+    // (0..=first_trivial_block - 1) covers that same number of blocks.
+    assert(first_trivial_block == cleaned_merged_interesting_remainder.len);
+    assert(first_trivial_block == new_remainder.len);
+
+    remainder1.copy_from(cleaned_merged_interesting_remainder, 0,
+                         first_trivial_block - 1, streams[0], gpu_indexes[0]);
+    remainder2.copy_from(new_remainder, 0, first_trivial_block - 1, streams[0],
+                         gpu_indexes[0]);
+  }
+
+  assert(remainder1.len == remainder2.len);
+
+  // Clean the quotient and remainder
+  // as even though they have no carries, they are not at nominal noise level
+  host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
+                remainder2.data, radix_params.big_lwe_dimension,
+                remainder1.len);
+
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+#pragma omp parallel sections
+  {
+#pragma omp section
+    {
+      integer_radix_apply_univariate_lookup_table_kb(
+          &mem_ptr->sub_stream_1, &gpu_indexes[0], 1, remainder, remainder, bsk,
+          ksk, num_blocks, mem_ptr->message_extract_lut_1);
+    }
+#pragma omp section
+    {
+      integer_radix_apply_univariate_lookup_table_kb(
+          &mem_ptr->sub_stream_2, &gpu_indexes[0], 1, quotient, quotient, bsk,
+          ksk, num_blocks, mem_ptr->message_extract_lut_2);
+    }
+  }
+  cuda_synchronize_stream(mem_ptr->sub_stream_1, gpu_indexes[0]);
+  cuda_synchronize_stream(mem_ptr->sub_stream_2, gpu_indexes[0]);
+}
+
+#endif // TFHE_RS_DIV_REM_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -2,98 +2,121 @@
 #include <linear_algebra.h>

 void cuda_full_propagation_64_inplace(
-    cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
-    void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
-    uint32_t num_blocks) {
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, // replaces the former single cuda_stream_t* parameter
+    void *input_blocks, int8_t *mem_ptr, void *ksk, void *bsk,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t ks_base_log, uint32_t ks_level, uint32_t pbs_base_log,
+    uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_blocks) { // body dispatches on polynomial_size to host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<N>>
 
  switch (polynomial_size) {
  case 256:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
-        stream, static_cast<uint64_t *>(input_blocks),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, // void** cast to cudaStream_t*; presumably gpu_count entries — confirm with caller
+        static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 512:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
-        stream, static_cast<uint64_t *>(input_blocks),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 1024:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
-        stream, static_cast<uint64_t *>(input_blocks),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 2048:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(input_blocks),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 4096:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
-        stream, static_cast<uint64_t *>(input_blocks),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 8192:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
-        stream, static_cast<uint64_t *>(input_blocks),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  case 16384:
    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
-        stream, static_cast<uint64_t *>(input_blocks),
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+        static_cast<uint64_t *>(input_blocks),
        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  default:
-    break;
+    PANIC("Cuda error (full propagation inplace): unsupported polynomial size. "
+          "Supported N's are powers of two"
+          " in the interval [256..16384].") // the default case used to be a bare break; it now goes through PANIC
  }
 }

 void scratch_cuda_full_propagation_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
+    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t lwe_dimension, // stream is now an opaque void* accompanied by an explicit gpu_index
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {
 
  scratch_cuda_full_propagation<uint64_t>(
-      stream, (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension,
-      glwe_dimension, polynomial_size, level_count, grouping_factor,
-      input_lwe_ciphertext_count, message_modulus, carry_modulus, pbs_type,
-      allocate_gpu_memory);
+      static_cast<cudaStream_t>(stream), gpu_index, // recover the typed stream handle before forwarding
+      (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension, glwe_dimension, // int8_t** is reinterpreted as int_fullprop_buffer<uint64_t>**
+      polynomial_size, level_count, grouping_factor, input_lwe_ciphertext_count,
+      message_modulus, carry_modulus, pbs_type, allocate_gpu_memory);
 }

-void cleanup_cuda_full_propagation(cuda_stream_t *stream,
+void cleanup_cuda_full_propagation(void *stream, uint32_t gpu_index, // drops the buffers held by the int_fullprop_buffer behind *mem_ptr_void
                                   int8_t **mem_ptr_void) {
 
  int_fullprop_buffer<uint64_t> *mem_ptr =
      (int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
+  auto s = static_cast<cudaStream_t>(stream); // typed stream handle reused by every drop/release below
 
-  cuda_drop_async(mem_ptr->lut_buffer, stream);
-  cuda_drop_async(mem_ptr->lut_indexes, stream);
+  cuda_drop_async(mem_ptr->lut_buffer, s, gpu_index);
+  cuda_drop_async(mem_ptr->lut_indexes, s, gpu_index);
 
-  cuda_drop_async(mem_ptr->pbs_buffer, stream);
+  cuda_drop_async(mem_ptr->lwe_indexes, s, gpu_index); // lwe_indexes is dropped here; pbs_buffer is no longer dropped directly (see switch below)
 
-  cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
-  cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
+  cuda_drop_async(mem_ptr->tmp_small_lwe_vector, s, gpu_index);
+  cuda_drop_async(mem_ptr->tmp_big_lwe_vector, s, gpu_index);
+
+  switch (mem_ptr->pbs_type) { // pbs_buffer is released through the pbs_buffer<> specialization matching pbs_type
+  case CLASSICAL: {
+    auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(mem_ptr->pbs_buffer);
+    x->release(s, gpu_index);
+  } break;
+  case MULTI_BIT: {
+    auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(mem_ptr->pbs_buffer);
+    x->release(s, gpu_index);
+  } break;
+  default:
+    PANIC("Cuda error (PBS): unsupported implementation variant.") // any other pbs_type value goes through PANIC
+  } // NOTE(review): the int_fullprop_buffer object itself is neither freed nor nulled in this function — confirm who owns it
 }

-void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
+void scratch_cuda_propagate_single_carry_kb_64_inplace(
+    void *stream, uint32_t gpu_index, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
@@ -105,23 +128,64 @@ void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

-  scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
-      stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
+  scratch_cuda_propagate_single_carry_kb_inplace(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }

-void cuda_propagate_single_carry_low_latency_kb_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t num_blocks) {
-  host_propagate_single_carry_low_latency<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array),
+void cuda_propagate_single_carry_kb_64_inplace( // renamed: the _low_latency infix is dropped from both this wrapper and the host routine it calls
+    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array, // stream array + GPU index list replace the single cuda_stream_t*
+    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
+  host_propagate_single_carry<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, // void** cast to cudaStream_t*; presumably gpu_count entries — confirm with caller
+      static_cast<uint64_t *>(lwe_array),
      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
      static_cast<uint64_t *>(ksk), num_blocks);
 }

-void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
-                                                     int8_t **mem_ptr_void) {
+void cleanup_cuda_propagate_single_carry(void *stream, uint32_t gpu_index, // calls release() on the int_sc_prop_memory behind *mem_ptr_void
+                                         int8_t **mem_ptr_void) {
  int_sc_prop_memory<uint64_t> *mem_ptr =
      (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
+  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index); // release() now takes the typed stream plus the GPU index
+} // NOTE(review): *mem_ptr_void is not reset in this function — confirm whether release() or the caller disposes of the object
+
+void scratch_cuda_apply_univariate_lut_kb_64( // packs the scalar arguments into int_radix_params, then forwards to scratch_cuda_apply_univariate_lut_kb<uint64_t>
+    void *stream, uint32_t gpu_index, int8_t **mem_ptr, void *input_lut,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension, // presumably big LWE dimension (derived) followed by small LWE dimension — confirm against int_radix_params
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus);
+
+  scratch_cuda_apply_univariate_lut_kb<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index, // opaque stream pointer converted to cudaStream_t
+      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut), // mem_ptr reinterpreted as int_radix_lut<uint64_t>**; input_lut passed on as uint64_t*
+      num_radix_blocks, params, allocate_gpu_memory);
+}
+
+void cuda_apply_univariate_lut_kb_64(void **streams, uint32_t *gpu_indexes, // casts the opaque pointers and forwards to host_apply_univariate_lut_kb<uint64_t>
+                                     uint32_t gpu_count, void *output_radix_lwe,
+                                     void *input_radix_lwe, int8_t *mem_ptr,
+                                     void *ksk, void *bsk, // NOTE(review): ksk precedes bsk here, the opposite of cuda_propagate_single_carry_kb_64_inplace; both are void*, so a swapped call still compiles
+                                     uint32_t num_blocks) {
+
+  host_apply_univariate_lut_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, // void** cast to cudaStream_t*; presumably gpu_count entries — confirm with caller
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<uint64_t *>(input_radix_lwe),
+      (int_radix_lut<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk), bsk, // only ksk is cast to uint64_t*; bsk stays void*
+      num_blocks);
+}
+
+void cleanup_cuda_apply_univariate_lut_kb_64(void *stream, uint32_t gpu_index, // calls release() on the int_radix_lut behind *mem_ptr_void
+                                             int8_t **mem_ptr_void) {
+  int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release(static_cast<cudaStream_t>(stream), gpu_index); // NOTE(review): *mem_ptr_void is not reset here — confirm whether release() or the caller disposes of the object
 }
--- a/Show More
+++ b/Show More