GITBOOK-5: Update TOC

GITBOOK-4: V2 design details
GITBOOK-3: correct a typo
2026-01-13 08:38:03 -05:00 · 2024-03-05 14:32:28 +00:00 · 2024-02-28 15:27:05 +00:00 · 2024-02-28 14:54:38 +00:00 · 2024-02-28 14:23:50 +00:00 · 2024-02-28 14:11:06 +00:00
847 changed files with 36843 additions and 96931 deletions
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,6 +1,6 @@
 ---
 name: Bug report
-about: Report a problem with TFHE-rs
+about: Report a problem with concrete
 title: ''
 labels: triage_required
 assignees: ''
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -1,6 +1,6 @@
 ---
 name: Feature request
-about: Suggest an idea for TFHE-rs
+about: Suggest an idea for concrete
 title: ''
 labels: feature_request
 assignees: ''
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -1,9 +0,0 @@
-self-hosted-runner:
-  # Labels of self-hosted runner in array of strings.
-  labels:
-    - m1mac
-    - 4090-desktop
-# Configuration variables in array of strings defined in your repository or
-# organization. `null` means disabling configuration variables check.
-# Empty array means no configuration variable is allowed.
-config-variables: null
--- a/.github/workflows/approve_label.yml
+++ b/.github/workflows/approve_label.yml
@@ -1,34 +0,0 @@
-# Manage approved label in pull request
-name: PR approved label manager
-
-on:
-  pull_request:
-  pull_request_review:
-    types: [submitted]
-
-jobs:
-  trigger-tests:
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: write
-    steps:
-      - name: Get current labels
-        uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
-
-      # Remove label if a push is performed after an approval
-      - name: Remove approved label
-        if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
-        uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
-        with:
-          # We use a PAT to have the same user (zama-bot) for label deletion as for creation.
-          github_token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-          labels: approved
-
-      # Add label only if the review is approved and if the label doesn't already exist
-      - name: Add approved label
-        uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
-        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
-        with:
-          # We need to use a PAT to be able to trigger `labeled` event for the other workflow.
-          github_token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-          labels: approved
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -6,52 +6,63 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (fast-tests)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
  fast-tests:
-    name: Fast CPU tests
-    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ inputs.runner_name }}
    steps:
+      # Step used for log purpose.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: stable

@@ -59,10 +70,6 @@ jobs:
        run: |
          make test_concrete_csprng

-      - name: Run tfhe-zk-pok tests
-        run: |
-          make test_zk_pok
-
      - name: Run core tests
        run: |
          AVX512_SUPPORT=ON make test_core_crypto
@@ -110,31 +117,11 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (fast-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, fast-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/aws_tfhe_gpu_4090_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_4090_tests.yml
@@ -1,75 +0,0 @@
-# Compile and test tfhe-cuda-backend on an RTX 4090 machine
-name: TFHE Cuda Backend - 4090 full tests
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-  pull_request:
-    types: [labeled]
-
-jobs:
-  cuda-tests-linux:
-    name: CUDA tests (RTX 4090)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, '4090_test') }}
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: ["self-hosted", "4090-desktop"]
-
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
-        with:
-          toolchain: stable
-
-      - name: Run fmt checks
-        run: |
-          make check_fmt_gpu
-
-      - name: Run clippy checks
-        run: |
-          make pcc_gpu
-
-      - name: Run core crypto, integer and internal CUDA backend tests
-        run: |
-          make test_gpu
-
-      - name: Run user docs tests
-        run: |
-          make test_user_doc_gpu
-
-      - name: Test C API
-        run: |
-          make test_c_api_gpu
-
-      - name: Run High Level API Tests
-        run: |
-          make test_high_level_api_gpu
-
-      - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
-        if: ${{ always() && github.event_name == 'pull_request' }}
-        with:
-          labels: 4090_test
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_tests.yml
@@ -1,47 +1,46 @@
-# Compile and test tfhe-cuda-backend on an AWS instance
-name: TFHE Cuda Backend - Full tests
+# Compile and test Concrete-cuda on an AWS instance
+name: Concrete Cuda - Full tests

 env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cuda-tests)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: gpu-test
-
-  cuda-tests-linux:
-    name: CUDA tests
-    needs: setup-ec2
+  run-cuda-tests-linux:
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: tfhe_cuda_backend_test-${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    name: Test code in EC2
+    runs-on: ${{ inputs.runner_name }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -54,8 +53,21 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}

    steps:
+      # Step used for log purpose.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
@@ -78,22 +90,16 @@ jobs:
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Run fmt checks
-        run: |
-          make check_fmt_gpu
+          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Run clippy checks
        run: |
-          make pcc_gpu
+          make clippy_gpu

-      - name: Run core crypto, integer and internal CUDA backend tests
+      - name: Run all tests
        run: |
          make test_gpu

@@ -104,39 +110,3 @@ jobs:
      - name: Test C API
        run: |
          make test_c_api_gpu
-
-      - name: Run High Level API Tests
-        run: |
-          make test_high_level_api_gpu
-
-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (cuda-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, cuda-tests-linux ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -5,54 +5,63 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: "Slab request ID"
+        type: string
+      fork_repo:
+        description: "Name of forked repo as user/repo"
+        type: string
+      fork_git_sha:
+        description: "Git SHA to checkout from fork"
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (unsigned-integer-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
-  unsigned-integer-tests:
-    name: Unsigned integer tests
-    needs: setup-ec2
+  integer-tests:
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ inputs.runner_name }}
    steps:
+      # Step used for log purpose.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: stable

@@ -75,31 +84,11 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (unsigned-integer-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, unsigned-integer-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -5,54 +5,63 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: "Slab request ID"
+        type: string
+      fork_repo:
+        description: "Name of forked repo as user/repo"
+        type: string
+      fork_git_sha:
+        description: "Git SHA to checkout from fork"
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (signed-integer-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
-  signed-integer-tests:
-    name: Signed integer tests
-    needs: setup-ec2
+  multi-bit-tests:
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ inputs.runner_name }}
    steps:
+      # Step used for log purpose.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: stable

@@ -79,31 +88,11 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (signed-integer-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, signed-integer-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -5,54 +5,63 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cpu-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-big
-
-  cpu-tests:
-    name: CPU tests
-    needs: setup-ec2
+  shortint-tests:
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ inputs.runner_name }}
    steps:
+      # Step used for log purpose.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: stable

@@ -60,10 +69,6 @@ jobs:
        run: |
          make test_concrete_csprng

-      - name: Run tfhe-zk-pok tests
-        run: |
-          make test_zk_pok
-
      - name: Run core tests
        run: |
          AVX512_SUPPORT=ON make test_core_crypto
@@ -105,31 +110,11 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (cpu-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, cpu-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -5,101 +5,83 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (wasm-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
-
  wasm-tests:
-    name: WASM tests
-    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ inputs.runner_name }}
    steps:
+      # Step used for log purpose.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: stable

-      - name: Install Node
-        run: |
-          make install_node
-
-      - name: Run fmt checks
-        run: |
-          make check_fmt_js
-
      - name: Run js on wasm API tests
        run: |
          make test_nodejs_wasm_api_in_docker

      - name: Run parallel wasm tests
        run: |
+          make install_node
          make ci_test_web_js_api_parallel

      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (wasm-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, wasm-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/boolean_benchmark.yml
+++ b/.github/workflows/boolean_benchmark.yml
@@ -33,7 +33,6 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  run-boolean-benchmarks:
@@ -53,7 +52,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -63,13 +62,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Run benchmarks with AVX512
        run: |
-          make bench_boolean
+          make AVX512_SUPPORT=ON bench_boolean

      - name: Parse results
        run: |
@@ -97,17 +96,17 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -126,11 +125,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Boolean benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Boolean benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -7,7 +7,6 @@ env:
  CARGO_TERM_COLOR: always
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
@@ -23,7 +22,7 @@ jobs:
      fail-fast: false

    steps:
-      - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11

      - name: Install and run newline linter checks
        if: matrix.os == 'ubuntu-latest'
@@ -68,9 +67,5 @@ jobs:
        run: |
          make build_c_api

-      - name: Build coverage tests
-        run: |
-          make build_tfhe_coverage
-
      # The wasm build check is a bit annoying to set-up here and is done during the tests in
      # aws_tfhe_tests.yml
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -1,27 +0,0 @@
-# Lint and check CI
-name: CI Lint and Checks
-
-on:
-  pull_request:
-
-env:
-  ACTIONLINT_VERSION: 1.6.27
-
-jobs:
-  lint-check:
-    name: Lint and checks
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-
-      - name: Get actionlint
-        run: |
-          bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) ${{ env.ACTIONLINT_VERSION }}
-          echo "f2ee6d561ce00fa93aab62a7791c1a0396ec7e8876b2a8f2057475816c550782  actionlint" > checksum
-          sha256sum -c checksum
-          ln -s "$(pwd)/actionlint" /usr/local/bin/
-
-      - name: Lint workflows
-        run: |
-          make lint_workflow
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -5,7 +5,6 @@ env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -40,7 +39,7 @@ jobs:
      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
    runs-on: ${{ inputs.runner_name }}
-    timeout-minutes: 11520 # 8 days
+    timeout-minutes: 1080
    steps:
      # Step used for log purpose.
      - name: Instance configuration used
@@ -53,7 +52,7 @@ jobs:
          echo "Fork git sha: ${{ inputs.fork_git_sha }}"

      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: ${{ inputs.fork_repo }}
          ref: ${{ inputs.fork_git_sha }}
@@ -63,13 +62,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: stable

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
+        uses: tj-actions/changed-files@90a06d6ba9543371ab4df8eeca0be07ca6054959
        with:
          files_yaml: |
            tfhe:
@@ -99,7 +98,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
+        uses: codecov/codecov-action@4fe8c5f003fae66aa5ebb77cfd3e7bfbbda0b6b0
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -107,24 +106,10 @@ jobs:
          fail_ci_if_error: true
          files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml

-      - name: Run integer coverage
-        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
-        run: |
-          make test_integer_cov
-
-      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
-        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          directory: ./coverage/
-          fail_ci_if_error: true
-          files: integer/cobertura.xml
-
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/core_crypto_gpu_benchmark.yml
+++ b/.github/workflows/core_crypto_gpu_benchmark.yml
@@ -1,182 +0,0 @@
-# Run core crypto benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
-name: Core crypto GPU benchmarks
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-
-jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cuda-benchmarks)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: gpu-bench
-
-  core-crypto-benchmarks:
-    name: CUDA core crypto benchmarks
-    needs: setup-ec2
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.1
-
-    steps:
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make bench_pbs_gpu
-          make bench_ks_gpu
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --name-suffix avx512 \
-          --walk-subdirs \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_core_crypto
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-  # FIXME This action needs docker to be installed on the machine beforehand.
-#      - name: Slack Notification
-#        if: ${{ failure() }}
-#        continue-on-error: true
-#        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-#        env:
-#          SLACK_COLOR: ${{ job.status }}
-#          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-#          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-#          SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-#          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-#          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-  teardown-ec2:
-    name: Teardown EC2 instance (cuda-benchmarks)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, core-crypto-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cuda-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/csprng_randomness_testing.yml
+++ b/.github/workflows/csprng_randomness_testing.yml
@@ -0,0 +1,78 @@
+# Runs the Dieharder randomness test suite for the CSPRNG on the runner named by the Slab-provided inputs.
+name: CSPRNG randomness testing Workflow
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+  RUST_BACKTRACE: "full"
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string
+
+jobs:
+  csprng-randomness-testing:
+    name: CSPRNG randomness testing
+    # One run per workflow, ref, AMI and instance type; a newer run cancels the one in progress.
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
+      cancel-in-progress: true
+    runs-on: ${{ inputs.runner_name }}
+
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}
+
+      # Exports HOME through GITHUB_ENV so that the following steps see it.
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        with:
+          toolchain: stable
+
+      - name: Dieharder randomness test suite
+        run: |
+          make dieharder_csprng
+
+      # Only sent when the job failed; an error in the notification itself does not fail the job.
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -1,94 +0,0 @@
-name: CSPRNG randomness testing Workflow
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
-
-
-jobs:
-  setup-ec2:
-    name: Setup EC2 instance (csprng-randomness-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: cpu-small
-
-  csprng-randomness-tests:
-    name: CSPRNG randomness tests
-    needs: setup-ec2
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-
-      - name: Set up home
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
-        with:
-          toolchain: stable
-
-      - name: Dieharder randomness test suite
-        run: |
-          make dieharder_csprng
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (csprng-randomness-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, csprng-randomness-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_4090_full_benchmark.yml
+++ b/.github/workflows/gpu_4090_full_benchmark.yml
@@ -1,202 +0,0 @@
-# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
-name: TFHE Cuda Backend - 4090 full benchmarks
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-  pull_request:
-    types: [labeled]
-  schedule:
-    # Weekly benchmarks will be triggered each Friday at 9p.m.
-    - cron: "0 21 * * 5"
-
-jobs:
-  cuda-integer-benchmarks:
-    name: Cuda integer benchmarks for all operations flavor  (RTX 4090)
-    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
-      cancel-in-progress: true
-    runs-on: ["self-hosted", "4090-desktop"]
-    timeout-minutes: 1440 # 24 hours
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [integer, integer_multi_bit]
-        op_flavor: [default, unchecked]
-
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-        with:
-          fetch-depth: 0
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
-        with:
-          toolchain: nightly
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Run integer benchmarks
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "rtx4090" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  cuda-core-crypto-benchmarks:
-    name: Cuda core crypto benchmarks  (RTX 4090)
-    if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
-    needs: cuda-integer-benchmarks
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
-      cancel-in-progress: true
-    runs-on: ["self-hosted", "4090-desktop"]
-    timeout-minutes: 1440 # 24 hours
-
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-        with:
-          fetch-depth: 0
-
-      - name: Get benchmark details
-        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
-        with:
-          toolchain: nightly
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Run integer benchmarks
-        run: |
-          make bench_pbs_gpu
-          make bench_ks_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware "rtx4090" \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_core_crypto
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ !success() && !cancelled() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  remove_github_label:
-    name: Remove 4090 bench label
-    if: ${{ always() && github.event_name == 'pull_request' }}
-    needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
-    runs-on: ["self-hosted", "4090-desktop"]
-    steps:
-      - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
-        with:
-          labels: 4090_bench
-          github_token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/integer_benchmark.yml
+++ b/.github/workflows/integer_benchmark.yml
@@ -26,7 +26,6 @@ env:
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  run-integer-benchmarks:
@@ -46,7 +45,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -56,13 +55,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Run benchmarks with AVX512
        run: |
-          make FAST_BENCH=TRUE bench_integer
+          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer

      - name: Parse benchmarks to csv
        run: |
@@ -70,7 +69,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,17 +90,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -120,11 +119,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_full_benchmark.yml
+++ b/.github/workflows/integer_full_benchmark.yml
@@ -29,7 +29,6 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  prepare-matrix:
@@ -41,17 +40,17 @@ jobs:
      - name: Weekly benchmarks
        if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
        run: |
-          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
+          echo "OP_FLAVOR=[\"default\"]" >> ${GITHUB_ENV}

      - name: Quarterly benchmarks
        if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
        run: |
-          echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"
+          echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> ${GITHUB_ENV}

      -  name: Set operation flavor output
         id: set_op_flavor
         run: |
-          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
+          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> ${GITHUB_OUTPUT}

  integer-benchmarks:
    name: Execute integer benchmarks for all operations flavor
@@ -74,17 +73,15 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

      - name: Get benchmark details
        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
+          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
+          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"

      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
@@ -92,20 +89,20 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
+          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}

      - name: Parse results
        run: |
@@ -121,7 +118,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -148,11 +145,11 @@ jobs:
    steps:
      - name: Notify
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_gpu_benchmark.yml
+++ b/.github/workflows/integer_gpu_benchmark.yml
@@ -2,9 +2,23 @@
 name: Integer GPU benchmarks

 on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
+    inputs:
+      instance_id:
+        description: "Instance ID"
+        type: string
+      instance_image_id:
+        description: "Instance AMI ID"
+        type: string
+      instance_type:
+        description: "Instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: "Slab request ID"
+        type: string

 env:
  CARGO_TERM_COLOR: always
@@ -12,30 +26,12 @@ env:
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cuda-benchmarks)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: gpu-bench
-
-  cuda-integer-benchmarks:
-    name: CUDA integer benchmarks
-    needs: setup-ec2
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+  run-integer-benchmarks:
+    name: Execute integer benchmarks in EC2
+    runs-on: ${{ github.event.inputs.runner_name }}
+    if: ${{ !cancelled() }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
@@ -43,29 +39,23 @@ jobs:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 11
+            gcc: 9
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.1
-
    steps:
-      - name: Install dependencies
+      - name: Instance configuration used
        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
+          echo "IDs: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"

      - name: Get benchmark date
        run: |
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -75,33 +65,30 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
+          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Run benchmarks with AVX512
        run: |
-          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
+          make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu

      - name: Parse benchmarks to csv
        run: |
@@ -109,7 +96,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -120,7 +107,7 @@ jobs:
          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware "n2-H100x1" \
+          --hardware ${{ inputs.instance_type }} \
          --backend gpu \
          --project-version "${COMMIT_HASH}" \
          --branch ${{ github.ref_name }} \
@@ -131,17 +118,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -157,39 +144,14 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

-# FIXME This action needs docker to be installed on the machine beforehand.
-#      - name: Slack Notification
-#        if: ${{ !success() && !cancelled() }}
-#        continue-on-error: true
-#        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-#        env:
-#          SLACK_COLOR: ${{ job.status }}
-#          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-#          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-#          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-#          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-#          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-  teardown-ec2:
-    name: Teardown EC2 instance (cuda-benchmarks)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, cuda-integer-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cuda-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_gpu_full_benchmark.yml
@@ -2,80 +2,75 @@
 name: Integer GPU full benchmarks

 on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
+    inputs:
+      instance_id:
+        description: "Instance ID"
+        type: string
+      instance_image_id:
+        description: "Instance AMI ID"
+        type: string
+      instance_type:
+        description: "Instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: "Slab request ID"
+        type: string
+      # This input is not used in this workflow but still mandatory since a calling workflow could
+      # use it. If a triggering command include a user_inputs field, then the triggered workflow
+      # must include this very input, otherwise the workflow won't be called.
+      # See start_full_benchmarks.yml as example.
+      user_inputs:
+        description: "Type of benchmarks to run"
+        type: string
+        default: "weekly_benchmarks"

 env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cuda-full-benchmarks)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: gpu-bench
-
-  cuda-integer-full-benchmarks:
-    name: CUDA integer full benchmarks
-    needs: setup-ec2
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
+  integer-benchmarks:
+    name: Execute integer benchmarks for all operations flavor
+    runs-on: ${{ github.event.inputs.runner_name }}
+    if: ${{ !cancelled() }}
    continue-on-error: true
    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
-        command: [integer, integer_multi_bit]
-        op_flavor: [default, unchecked]
+        command: [ integer, integer_multi_bit]
+        op_flavor: [ default, unchecked ]
        # explicit include-based build matrix, of known valid options
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
-            gcc: 11
+            gcc: 9
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.1
-
    steps:
-      - name: Install dependencies
+      - name: Instance configuration used
        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
+          echo "IDs: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

      - name: Get benchmark details
        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
+          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
+          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"

      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
@@ -83,46 +78,43 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
+          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu

      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware "n2-H100x1" \
+          --hardware ${{ inputs.instance_type }} \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
@@ -133,7 +125,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -152,39 +144,19 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

-  # FIXME This action needs docker to be installed on the machine beforehand.
-  #      - name: Slack Notification
-  #        if: ${{ !success() && !cancelled() }}
-  #        continue-on-error: true
-  #        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-  #        env:
-  #          SLACK_COLOR: ${{ job.status }}
-  #          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  #          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  #          SLACK_MESSAGE: "Integer GPU full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-  #          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  #          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-  teardown-ec2:
-    name: Teardown EC2 instance (cuda-full-benchmarks)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, cuda-integer-full-benchmarks ]
-    runs-on: ubuntu-latest
+  slack-notification:
+    name: Slack Notification
+    runs-on: ${{ github.event.inputs.runner_name }}
+    if: ${{ failure() }}
+    needs: integer-benchmarks
    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
+      - name: Notify
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cuda-full-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "Integer GPU full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_multi_bit_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_benchmark.yml
@@ -26,7 +26,6 @@ env:
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  run-integer-benchmarks:
@@ -46,7 +45,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -56,13 +55,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Run multi-bit benchmarks with AVX512
        run: |
-          make FAST_BENCH=TRUE bench_integer_multi_bit
+          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer_multi_bit

      - name: Parse benchmarks to csv
        run: |
@@ -70,7 +69,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,17 +90,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -120,11 +119,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_multi_bit_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_gpu_benchmark.yml
@@ -1,10 +1,24 @@
 # Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
-name: Integer GPU Multi-bit benchmarks
+name: Integer Multi-bit benchmarks

 on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
+    inputs:
+      instance_id:
+        description: "Instance ID"
+        type: string
+      instance_image_id:
+        description: "Instance AMI ID"
+        type: string
+      instance_type:
+        description: "Instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: "Slab request ID"
+        type: string

 env:
  CARGO_TERM_COLOR: always
@@ -12,61 +26,37 @@ env:
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cuda-multi-bit-benchmarks)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: gpu-bench
-
-  cuda-integer-multi-bit-benchmarks:
-    name: CUDA integer multi-bit benchmarks
-    needs: setup-ec2
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
-    timeout-minutes: 1440 # 24 hours
+  run-integer-benchmarks:
+    name: Execute integer multi-bit benchmarks in EC2
+    runs-on: ${{ github.event.inputs.runner_name }}
+    if: ${{ !cancelled() }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 11
+            cuda: "11.8"
+            cuda_arch: "70"
+            gcc: 9
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.1
-
    steps:
-      - name: Install dependencies
+      - name: Instance configuration used
        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
+          echo "IDs: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"

      - name: Get benchmark date
        run: |
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -76,33 +66,30 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Export CUDA variables
        if: ${{ !cancelled() }}
        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
+          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Run multi-bit benchmarks with AVX512
        run: |
-          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
+          make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu

      - name: Parse benchmarks to csv
        run: |
@@ -110,7 +97,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -121,7 +108,7 @@ jobs:
          COMMIT_HASH="$(git describe --tags --dirty)"
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
-          --hardware "n2-H100x1" \
+          --hardware ${{ inputs.instance_type }} \
          --backend gpu \
          --project-version "${COMMIT_HASH}" \
          --branch ${{ github.ref_name }} \
@@ -132,17 +119,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -158,39 +145,14 @@ jobs:
          -d @${{ env.RESULTS_FILENAME }} \
          ${{ secrets.SLAB_URL }}

-# FIXME This action needs docker to be installed on the machine beforehand.
-#      - name: Slack Notification
-#        if: ${{ !success() && !cancelled() }}
-#        continue-on-error: true
-#        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-#        env:
-#          SLACK_COLOR: ${{ job.status }}
-#          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-#          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-#          SLACK_MESSAGE: "Integer GPU benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-#          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-#          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-  teardown-ec2:
-    name: Teardown EC2 instance (cuda-multi-bit-benchmarks)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, cuda-integer-multi-bit-benchmarks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL_PRE_PROD }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cuda-multi-bit-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -15,7 +15,6 @@ env:
  CARGO_TERM_COLOR: always
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  FAST_TESTS: "TRUE"

@@ -27,14 +26,12 @@ jobs:
  cargo-builds:
    if: ${{ (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
    runs-on: ["self-hosted", "m1mac"]
-    # 12 hours, default is 6 hours, hopefully this is more than enough
-    timeout-minutes: 720

    steps:
-      - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: stable

@@ -74,10 +71,6 @@ jobs:
        run: |
          make test_concrete_csprng

-      - name: Run tfhe-zk-pok tests
-        run: |
-          make test_zk_pok
-
      - name: Run core tests
        run: |
          make test_core_crypto
@@ -137,7 +130,7 @@ jobs:
      - name: Slack Notification
        if: ${{ needs.cargo-builds.result != 'skipped' }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ needs.cargo-builds.result }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -30,7 +30,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -49,7 +49,7 @@ jobs:

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
-        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
+        uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
        with:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
@@ -65,7 +65,7 @@ jobs:

      - name: Publish Node package
        if: ${{ inputs.push_node_package }}
-        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
+        uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
        with:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
@@ -74,7 +74,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -32,7 +32,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -1,129 +0,0 @@
-# Publish new release of tfhe-cuda-backend on crates.io.
-name: Publish CUDA release
-
-on:
-  workflow_dispatch:
-    inputs:
-      dry_run:
-        description: "Dry-run"
-        type: boolean
-        default: true
-      push_to_crates:
-        description: "Push to crate"
-        type: boolean
-        default: true
-
-env:
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-jobs:
-  setup-ec2:
-    name: Setup EC2 instance (publish-cuda-release)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          profile: gpu-test
-
-  publish-cuda-release:
-    name: Publish CUDA Release
-    needs: setup-ec2
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 9
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
-        with:
-          toolchain: stable
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Publish crate.io package
-        if: ${{ inputs.push_to_crates }}
-        env:
-          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
-          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
-        run: |
-          cargo publish -p tfhe-cuda-backend --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (publish-release)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, publish-cuda-release ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@1dced74825027fe3d481392163ed8fc56813fb5d
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -17,14 +17,13 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11

      - name: Checkout lattice-estimator
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
-          ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'

      - name: Install Sage
        run: |
@@ -42,7 +41,7 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/core_crypto_benchmark.yml
+++ b/.github/workflows/core_crypto_benchmark.yml
@@ -1,5 +1,5 @@
-# Run core crypto benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: Core crypto benchmarks
+# Run PBS benchmarks on an AWS instance and return parsed results to Slab CI bot.
+name: PBS benchmarks

 on:
  workflow_dispatch:
@@ -33,11 +33,10 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
-  run-core-crypto-benchmarks:
-    name: Execute core crypto benchmarks in EC2
+  run-pbs-benchmarks:
+    name: Execute PBS benchmarks in EC2
    runs-on: ${{ github.event.inputs.runner_name }}
    if: ${{ !cancelled() }}
    steps:
@@ -53,7 +52,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -63,14 +62,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Run benchmarks with AVX512
        run: |
-          make bench_pbs
-          make bench_ks
+          make AVX512_SUPPORT=ON bench_pbs

      - name: Parse results
        run: |
@@ -88,17 +86,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
-          name: ${{ github.sha }}_core_crypto
+          name: ${{ github.sha }}_pbs
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -117,11 +115,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "PBS benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "PBS benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/pbs_gpu_benchmark.yml
+++ b/.github/workflows/pbs_gpu_benchmark.yml
@@ -0,0 +1,156 @@
+# Run PBS benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
+name: PBS GPU benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      instance_id:
+        description: "Instance ID"
+        type: string
+      instance_image_id:
+        description: "Instance AMI ID"
+        type: string
+      instance_type:
+        description: "Instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: "Slab request ID"
+        type: string
+      # This input is not used in this workflow but is still mandatory since a calling workflow
+      # could use it. If a triggering command includes a user_inputs field, then the triggered
+      # workflow must declare this very input, otherwise the workflow won't be called.
+      # See start_full_benchmarks.yml as example.
+      user_inputs:
+        description: "Type of benchmarks to run"
+        type: string
+        default: "weekly_benchmarks"
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+
+jobs:
+  run-pbs-benchmarks:
+    name: Execute PBS benchmarks in EC2
+    runs-on: ${{ github.event.inputs.runner_name }}
+    if: ${{ !cancelled() }}
+    strategy:
+      fail-fast: false
+      # Explicit include-based build matrix of known valid options; the export
+      # steps below read matrix.cuda and matrix.gcc from it.
+      matrix:
+        include:
+          - os: ubuntu-22.04
+            cuda: "11.8"
+            cuda_arch: "70"
+            gcc: 9
+    env:
+      # Expanded as $CUDA_PATH by the "Export CUDA variables" step.
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+    steps:
+      - name: Instance configuration used
+        run: |
+          echo "IDs: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+
+      - name: Get benchmark date
+        run: |
+          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          fetch-depth: 0
+
+      - name: Set up home
+        # "Install rust" step requires root user to have a HOME directory which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        with:
+          toolchain: nightly
+
+      # Persist the CUDA locations for the following steps via GITHUB_ENV / GITHUB_PATH.
+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Run benchmarks with AVX512
+        run: |
+          make AVX512_SUPPORT=ON bench_pbs_gpu
+
+      - name: Parse results
+        run: |
+          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
+          COMMIT_HASH="$(git describe --tags --dirty)"
+          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          --database tfhe_rs \
+          --hardware ${{ inputs.instance_type }} \
+          --backend gpu \
+          --project-version "${COMMIT_HASH}" \
+          --branch ${{ github.ref_name }} \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${{ env.BENCH_DATE }}" \
+          --name-suffix avx512 \
+          --walk-subdirs \
+          --throughput
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
+        with:
+          name: ${{ github.sha }}_pbs
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          echo "Computing HMac on downloaded artifact"
+          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
+          echo "Sending results to Slab..."
+          curl -v -k \
+          -H "Content-Type: application/json" \
+          -H "X-Slab-Repository: ${{ github.repository }}" \
+          -H "X-Slab-Command: store_data_v2" \
+          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
+          -d @${{ env.RESULTS_FILENAME }} \
+          ${{ secrets.SLAB_URL }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "PBS GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/shortint_benchmark.yml
+++ b/.github/workflows/shortint_benchmark.yml
@@ -25,7 +25,6 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  run-shortint-benchmarks:
@@ -45,7 +44,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -55,13 +54,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Run benchmarks with AVX512
        run: |
-          make bench_shortint
+          make AVX512_SUPPORT=ON bench_shortint

      - name: Parse results
        run: |
@@ -89,17 +88,17 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_shortint
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -118,11 +117,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Shortint benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Shortint benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/shortint_full_benchmark.yml
+++ b/.github/workflows/shortint_full_benchmark.yml
@@ -33,7 +33,6 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  shortint-benchmarks:
@@ -53,17 +52,15 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

      - name: Get benchmark details
        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
+          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
+          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"

      - name: Set up home
        # "Install rust" step require root user to have a HOME directory which is not set.
@@ -71,20 +68,20 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
+          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint

      - name: Parse results
        run: |
@@ -115,7 +112,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -142,11 +139,11 @@ jobs:
    steps:
      - name: Notify
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Shortint full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_benchmark.yml
+++ b/.github/workflows/signed_integer_benchmark.yml
@@ -26,7 +26,6 @@ env:
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  run-integer-benchmarks:
@@ -46,7 +45,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -56,13 +55,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Run benchmarks with AVX512
        run: |
-          make FAST_BENCH=TRUE bench_signed_integer
+          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_signed_integer

      - name: Parse benchmarks to csv
        run: |
@@ -70,7 +69,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,17 +90,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -120,11 +119,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_full_benchmark.yml
+++ b/.github/workflows/signed_integer_full_benchmark.yml
@@ -29,7 +29,6 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  integer-benchmarks:
@@ -52,17 +51,15 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

      - name: Get benchmark details
        run: |
-          {
-            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
-            echo "COMMIT_HASH=$(git describe --tags --dirty)";
-          } >> "${GITHUB_ENV}"
+          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
+          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
+          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"

      - name: Set up home
        # "Install rust" step requires root user to have a HOME directory which is not set.
@@ -70,20 +67,20 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
+          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}

      - name: Parse results
        run: |
@@ -99,7 +96,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -126,11 +123,11 @@ jobs:
    steps:
      - name: Notify
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Signed integer full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_multi_bit_benchmark.yml
+++ b/.github/workflows/signed_integer_multi_bit_benchmark.yml
@@ -26,7 +26,6 @@ env:
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  run-integer-benchmarks:
@@ -46,7 +45,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -56,13 +55,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

      - name: Run multi-bit benchmarks with AVX512
        run: |
-          make FAST_BENCH=TRUE bench_signed_integer_multi_bit
+          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_signed_integer_multi_bit

      - name: Parse benchmarks to csv
        run: |
@@ -70,7 +69,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,17 +90,17 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -120,11 +119,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/start_benchmarks.yml
+++ b/.github/workflows/start_benchmarks.yml
@@ -32,12 +32,12 @@ on:
        description: "Run signed integer multi bit benches"
        type: boolean
        default: true
-      core_crypto_bench:
-        description: "Run core crypto benches"
+      pbs_bench:
+        description: "Run PBS benches"
        type: boolean
        default: true
-      core_crypto_gpu_bench:
-        description: "Run core crypto benches on GPU"
+      pbs_gpu_bench:
+        description: "Run PBS benches on GPU"
        type: boolean
        default: true
      wasm_client_bench:
@@ -54,17 +54,17 @@ jobs:
                   integer_bench, integer_multi_bit_bench,
                   signed_integer_bench, signed_integer_multi_bit_bench,
                   integer_gpu_bench, integer_multi_bit_gpu_bench,
-                   core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
+                   pbs_bench, pbs_gpu_bench, wasm_client_bench ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
+        uses: tj-actions/changed-files@90a06d6ba9543371ab4df8eeca0be07ca6054959
        with:
          files_yaml: |
            common_benches:
@@ -102,20 +102,20 @@ jobs:
              - tfhe/src/integer/**
              - tfhe/benches/integer/signed_bench.rs
              - .github/workflows/signed_integer_multi_bit_benchmark.yml
-            core_crypto_bench:
+            pbs_bench:
              - tfhe/src/core_crypto/**
              - tfhe/benches/core_crypto/**
-              - .github/workflows/core_crypto_benchmark.yml
+              - .github/workflows/pbs_benchmark.yml
            wasm_client_bench:
              - tfhe/web_wasm_parallel_tests/**
              - .github/workflows/wasm_client_benchmark.yml

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Start AWS job in Slab
        # If manually triggered check that the current bench has been requested
--- a/.github/workflows/start_full_benchmarks.yml
+++ b/.github/workflows/start_full_benchmarks.yml
@@ -26,20 +26,20 @@ jobs:
      matrix:
        command: [ boolean_bench, shortint_full_bench,
                   integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
-                   core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
+                   pbs_bench, pbs_gpu_bench, wasm_client_bench ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Set benchmarks type as weekly
        if: (github.event_name == 'workflow_dispatch' && inputs.benchmark_type == 'weekly') || github.event.schedule == '0 1 * * 6'
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -13,11 +13,11 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0
      - name: Save repo
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: repo-archive
          path: '.'
@@ -26,12 +26,12 @@ jobs:
        with:
          source_repo: "zama-ai/tfhe-rs"
          source_branch: "main"
-          destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.FHE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
+          destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.CONCRETE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
          destination_branch: "main"
      - name: git-sync tags
        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
        with:
          source_repo: "zama-ai/tfhe-rs"
          source_branch: "refs/tags/*"
-          destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.FHE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
+          destination_repo: "https://${{ secrets.BOT_USERNAME }}:${{ secrets.CONCRETE_ACTIONS_TOKEN }}@github.com/${{ secrets.SYNC_DEST_REPO }}"
          destination_branch: "refs/tags/*"
--- a/.github/workflows/trigger_aws_tests_on_pr.yml
+++ b/.github/workflows/trigger_aws_tests_on_pr.yml
@@ -0,0 +1,55 @@
+# Trigger an AWS build each time commits are pushed to a pull request.
+name: PR AWS build trigger
+
+on:
+  pull_request:
+  pull_request_review:
+    types: [submitted]
+
+jobs:
+  trigger-tests:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Get current labels
+        uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
+
+      - name: Remove approved label
+        if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
+        uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          labels: approved
+
+      - name: Launch fast tests
+        if: ${{ github.event_name == 'pull_request' }}
+        uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
+        with:
+          allow-repeats: true
+          message: |
+            @slab-ci cpu_fast_test
+            @slab-ci gpu_test
+
+      - name: Add approved label
+        uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
+        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          labels: approved
+
+      # PR label 'approved' presence is checked to avoid running the full test suite several times
+      # in case of multiple approvals without new commits in between.
+      - name: Launch full tests suite
+        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
+        uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
+        with:
+          allow-repeats: true
+          message: |
+            Pull Request has been approved :tada:
+            Launching full test suite...
+            @slab-ci cpu_test
+            @slab-ci cpu_unsigned_integer_test
+            @slab-ci cpu_signed_integer_test
+            @slab-ci cpu_wasm_test
+            @slab-ci csprng_randomness_testing
--- a/.github/workflows/wasm_client_benchmark.yml
+++ b/.github/workflows/wasm_client_benchmark.yml
@@ -33,7 +33,6 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"

 jobs:
  run-wasm-client-benchmarks:
@@ -53,7 +52,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          fetch-depth: 0

@@ -63,7 +62,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
+        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
        with:
          toolchain: nightly

@@ -98,17 +97,17 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
        with:
          name: ${{ github.sha }}_wasm
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
        with:
          repository: zama-ai/slab
          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
@@ -127,11 +126,11 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "WASM benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "WASM benchmarks failed. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,3 @@ dieharder_run.log

 # Coverage reports
 /coverage/
-
-# Cuda local build
-backends/tfhe-cuda-backend/cuda/cmake-build-debug/
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,13 +1,6 @@
 [workspace]
 resolver = "2"
-members = [
-    "tfhe",
-    "tfhe-zk-pok",
-    "tasks",
-    "apps/trivium",
-    "concrete-csprng",
-    "backends/tfhe-cuda-backend",
-]
+members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng"]

 [profile.bench]
 lto = "fat"
@@ -24,4 +17,3 @@ lto = "off"
 inherits = "dev"
 opt-level = 3
 lto = "off"
-debug-assertions = false
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,6 @@ OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
 TARGET_ARCH_FEATURE:=$(shell ./scripts/get_arch_feature.sh)
-CPU_COUNT=$(shell ./scripts/cpu_count.sh)
 RS_BUILD_TOOLCHAIN:=stable
 CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
 CARGO_PROFILE?=release
@@ -62,7 +61,7 @@ REGEX_STRING?=''
 REGEX_PATTERN?=''

 # tfhe-cuda-backend
-TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
+TFHECUDA_SRC="backends/tfhe-cuda-backend/cuda"
 TFHECUDA_BUILD=$(TFHECUDA_SRC)/build

 # Exclude these files from coverage reports
@@ -120,12 +119,7 @@ install_wasm_pack: install_rs_build_toolchain

 .PHONY: install_node # Install last version of NodeJS via nvm
 install_node:
-	curl -o nvm_install.sh https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh
-	@echo "2ed5e94ba12434370f0358800deb69f514e8bce90f13beb0e1b241d42c6abafd nvm_install.sh" > nvm_checksum
-	@sha256sum -c nvm_checksum
-	@rm nvm_checksum
-	$(SHELL) nvm_install.sh
-	@rm nvm_install.sh
+	curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh | $(SHELL)
 	source ~/.bashrc
 	$(SHELL) -i -c 'nvm install $(NODE_VERSION)' || \
 	( echo "Unable to install node, unknown error." && exit 1 )
@@ -150,61 +144,23 @@ check_linelint_installed:
 	@printf "\n" | linelint - > /dev/null 2>&1 || \
 	( echo "Unable to locate linelint. Try installing it: https://github.com/fernandrone/linelint/releases" && exit 1 )

-.PHONY: check_actionlint_installed # Check if actionlint workflow linter is installed
-check_actionlint_installed:
-	@actionlint --version > /dev/null 2>&1 || \
-	( echo "Unable to locate actionlint. Try installing it: https://github.com/rhysd/actionlint/releases" && exit 1 )
-
-.PHONY: check_nvm_installed # Check if Node Version Manager is installed
-check_nvm_installed:
-	@source ~/.nvm/nvm.sh && nvm --version > /dev/null 2>&1 || \
-	( echo "Unable to locate Node. Run 'make install_node'" && exit 1 )
-
 .PHONY: fmt # Format rust code
 fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt

-.PHONY: fmt_js # Format javascript code
-fmt_js: check_nvm_installed
-	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
-	$(MAKE) -C tfhe/web_wasm_parallel_tests fmt
-
 .PHONY: fmt_gpu # Format rust and cuda code
 fmt_gpu: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
 	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh

-.PHONY: fmt_c_tests # Format c tests
-fmt_c_tests:
-	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format -style=file -i {} \;
-
 .PHONY: check_fmt # Check rust code format
 check_fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check

-.PHONY: check_fmt_c_tests  # Check C tests format
-check_fmt_c_tests:
-	find tfhe/c_api_tests/ -regex '.*\.\(cpp\|hpp\|cu\|c\|h\)' -exec clang-format --dry-run --Werror -style=file {} \;
-
-.PHONY: check_fmt_gpu # Check rust and cuda code format
-check_fmt_gpu: install_rs_check_toolchain
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
-	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
-
-.PHONY: check_fmt_js # Check javascript code format
-check_fmt_js: check_nvm_installed
-	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
-	$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt
-
-.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
+.PHONY: clippy_gpu # Run clippy lints on the gpu backend
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
-		--all-targets \
+		--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
@@ -215,10 +171,6 @@ fix_newline: check_linelint_installed
 check_newline: check_linelint_installed
 	linelint .

-.PHONY: lint_workflow # Run static linter on GitHub workflows
-lint_workflow: check_actionlint_installed
-	actionlint
-
 .PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
 clippy_core: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -261,7 +213,7 @@ clippy: install_rs_check_toolchain
 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
 clippy_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
@@ -277,13 +229,13 @@ clippy_tasks:

 .PHONY: clippy_trivium # Run clippy lints on Trivium app
 clippy_trivium: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		-p tfhe-trivium -- --no-deps -D warnings

 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
@@ -292,24 +244,14 @@ clippy_concrete_csprng:
 		--features=$(TARGET_ARCH_FEATURE) \
 		-p concrete-csprng -- --no-deps -D warnings

-.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
-clippy_zk_pok:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p tfhe-zk-pok -- --no-deps -D warnings
-
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets clippy_c_api \
-clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
+clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_trivium

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
 clippy_concrete_csprng

-.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
-clippy_cuda_backend: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		-p tfhe-cuda-backend -- --no-deps -D warnings
-
 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
@@ -348,11 +290,6 @@ build_tfhe_full: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets

-.PHONY: build_tfhe_coverage # Build with test coverage enabled
-build_tfhe_coverage: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
-
 .PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
 symlink_c_libs_without_fingerprint:
 	@./scripts/symlink_c_libs_without_fingerprint.sh \
@@ -362,14 +299,14 @@ symlink_c_libs_without_fingerprint:
 .PHONY: build_c_api # Build the C API for boolean, shortint and integer
 build_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,$(FORWARD_COMPAT_FEATURE) \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,$(FORWARD_COMPAT_FEATURE) \
 		-p $(TFHE_SPEC)
 	@"$(MAKE)" symlink_c_libs_without_fingerprint

 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,gpu \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,gpu \
 		-p $(TFHE_SPEC)
 	@"$(MAKE)" symlink_c_libs_without_fingerprint

@@ -385,7 +322,7 @@ build_web_js_api: install_rs_build_toolchain install_wasm_pack
 	cd tfhe && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
 		wasm-pack build --release --target=web \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api

 .PHONY: build_web_js_api_parallel # Build the js API targeting the web browser with parallelism support
 build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
@@ -393,7 +330,7 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
 	rustup component add rust-src --toolchain $(RS_CHECK_TOOLCHAIN) && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS) -C target-feature=+atomics,+bulk-memory,+mutable-globals" rustup run $(RS_CHECK_TOOLCHAIN) \
 		wasm-pack build --release --target=web \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok-experimental \
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api \
 		-Z build-std=panic_abort,std

 .PHONY: build_node_js_api # Build the js API targeting nodejs
@@ -401,7 +338,7 @@ build_node_js_api: install_rs_build_toolchain install_wasm_pack
 	cd tfhe && \
 	RUSTFLAGS="$(WASM_RUSTFLAGS)" rustup run "$(RS_BUILD_TOOLCHAIN)" \
 		wasm-pack build --release --target=nodejs \
-		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,zk-pok-experimental
+		-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api

 .PHONY: build_concrete_csprng # Build concrete_csprng
 build_concrete_csprng: install_rs_build_toolchain
@@ -411,10 +348,10 @@ build_concrete_csprng: install_rs_build_toolchain
 .PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
 test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental -p $(TFHE_SPEC) -- core_crypto::
+		--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok-experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
+			--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
 	fi

 .PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
@@ -422,33 +359,25 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
 		--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
+		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage \
 		-p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 			--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
 			--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
-			-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
+			--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage,$(AVX512_FEATURE) \
+			-p $(TFHE_SPEC) -- core_crypto::; \
 	fi

-.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
-test_cuda_backend:
-	mkdir -p "$(TFHECUDA_BUILD)" && \
-		cd "$(TFHECUDA_BUILD)" && \
-		cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
-		make -j "$(CPU_COUNT)" && \
-		make test
-
 .PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
+test_gpu: test_core_crypto_gpu test_integer_gpu

 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- core_crypto::gpu::

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
 test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
@@ -467,8 +396,8 @@ test_boolean_cov: install_rs_check_toolchain install_tarpaulin
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache \
-		-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::
+		--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,__coverage \
+		-p $(TFHE_SPEC) -- boolean::

 .PHONY: test_c_api_rs # Run the rust tests for the C API
 test_c_api_rs: install_rs_check_toolchain
@@ -512,8 +441,8 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
-		-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::
+		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,__coverage \
+		-p $(TFHE_SPEC) -- shortint::

 .PHONY: test_integer_ci # Run the tests for integer ci
 test_integer_ci: install_rs_check_toolchain install_cargo_nextest
@@ -573,45 +502,24 @@ test_integer: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::

-.PHONY: test_integer_cov # Run the tests of the integer module with code coverage
-test_integer_cov: install_rs_check_toolchain install_tarpaulin
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
-		--out xml --output-dir coverage/integer --line --engine llvm --timeout 500 \
-		--implicit-test-threads \
-		--exclude-files $(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
-		-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::
-
 .PHONY: test_high_level_api # Run all the tests for high_level_api
 test_high_level_api: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
 		-- high_level_api::

-test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
-		-E "test(/high_level_api::.*gpu.*/)"
-
 .PHONY: test_user_doc # Run tests from the .md documentation
 test_user_doc: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok-experimental \
-		-p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
 		-- test_user_docs::

 .PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
 test_user_doc_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok-experimental -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu -p $(TFHE_SPEC) \
 		-- test_user_docs::

-.PHONY: test_fhe_strings # Run tests for fhe_strings example
-test_fhe_strings: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--example fhe_strings \
-		--features=$(TARGET_ARCH_FEATURE),integer
-
 .PHONY: test_regex_engine # Run tests for regex_engine example
 test_regex_engine: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -642,46 +550,33 @@ test_concrete_csprng:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng

-.PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
-test_zk_pok:
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		-p tfhe-zk-pok
-
 .PHONY: doc # Build rust doc
 doc: install_rs_check_toolchain
-	@# Even though we are not in docs.rs, this allows to "just" build the doc
-	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental --no-deps -p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps -p $(TFHE_SPEC)

 .PHONY: docs # Build rust doc alias for doc
 docs: doc

 .PHONY: lint_doc # Build rust doc with linting enabled
 lint_doc: install_rs_check_toolchain
-	@# Even though we are not in docs.rs, this allows to "just" build the doc
-	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental -p $(TFHE_SPEC) --no-deps
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --no-deps

 .PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
 lint_docs: lint_doc

 .PHONY: format_doc_latex # Format the documentation latex equations to avoid broken rendering.
 format_doc_latex:
-	RUSTFLAGS="" cargo xtask format_latex_doc
+	cargo xtask format_latex_doc
 	@"$(MAKE)" --no-print-directory fmt
 	@printf "\n===============================\n\n"
 	@printf "Please manually inspect changes made by format_latex_doc, rustfmt can break equations \
 	if the line length is exceeded\n"
 	@printf "\n===============================\n"

-.PHONY: check_md_docs_are_tested # Checks that the rust codeblocks in our .md files are tested
-check_md_docs_are_tested:
-	RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested
-
 .PHONY: check_compile_tests # Build tests in debug without running them
 check_compile_tests:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
@@ -693,16 +588,6 @@ check_compile_tests:
 		./scripts/c_api_tests.sh --build-only; \
 	fi

-.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
-check_compile_tests_benches_gpu: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
-		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,gpu \
-		-p $(TFHE_SPEC)
-	mkdir -p "$(TFHECUDA_BUILD)" && \
-		cd "$(TFHECUDA_BUILD)" && \
-		cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
-		make -j "$(CPU_COUNT)"
-
 .PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
 build_nodejs_test_docker:
 	DOCKER_BUILDKIT=1 docker build --build-arg RUST_TOOLCHAIN="$(RS_BUILD_TOOLCHAIN)" \
@@ -755,21 +640,21 @@ bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
@@ -777,7 +662,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -785,7 +670,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -793,25 +678,25 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --

 .PHONY: bench_shortint # Run benchmarks for shortint
 bench_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)

 .PHONY: bench_oprf # Run benchmarks for OPRF (shortint and integer)
 bench_oprf: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench oprf-shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench oprf-integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)



@@ -821,38 +706,26 @@ bench_shortint_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --


 .PHONY: bench_boolean # Run benchmarks for boolean
 bench_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench boolean-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)

 .PHONY: bench_pbs # Run benchmarks for PBS
 bench_pbs: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)

 .PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
 bench_pbs_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_ks # Run benchmarks for keyswitch
-bench_ks: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench ks-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_ks_gpu # Run benchmarks for PBS on GPU backend
-bench_ks_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench ks-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)

 .PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
 bench_web_js_api_parallel: build_web_js_api_parallel
@@ -869,7 +742,7 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
 #
 .PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
 gen_key_cache: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 		--example generates_test_keys \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -- \
 		$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
@@ -942,15 +815,13 @@ sha256_bool: install_rs_check_toolchain
 	--features=$(TARGET_ARCH_FEATURE),boolean

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
-pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_all \
-check_compile_tests
+pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
-pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
+pcc_gpu: pcc clippy_gpu

 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
-fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_fast \
-check_compile_tests
+fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests

 .PHONY: conformance # Automatically fix problems that can be fixed
 conformance: fix_newline fmt
--- a/README.md
+++ b/README.md
@@ -1,71 +1,41 @@
 <p align="center">
 <!-- product name logo -->
-<picture>
-  <source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/5283e0ba-da1e-43af-9f2a-c5221367a12b">
-  <source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/b94a8c96-7595-400b-9311-70765c706955">
-  <img width=600 alt="Zama TFHE-rs">
-</picture>
+  <img width=600 src="https://user-images.githubusercontent.com/5758427/231206749-8f146b97-3c5a-4201-8388-3ffa88580415.png">
+</p>
+<hr/>
+<p align="center">
+  <a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources</a>
+</p>
+<p align="center">
+<!-- Version badge using shields.io -->
+  <a href="https://github.com/zama-ai/tfhe-rs/releases">
+    <img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square">
+  </a>
+  <!-- License badge using shields.io -->
+  <a href="#license">
+    <img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-orange?style=flat-square">
+  </a>
+<!-- Zama Bounty Program -->
+  <a href="https://github.com/zama-ai/bounty-program">
+    <img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-yellow?style=flat-square">
+  </a>
 </p>
-
 <hr/>

-<p align="center">
-  <a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
-</p>

+**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer
+arithmetics over encrypted data. It includes:
+ - a **Rust** API
+ - a **C** API
+ - and a **client-side WASM** API

-<p align="center">
-  <a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
-  <a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
-  <a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
-</p>
-
-## About
-
-### What is TFHE-rs
-
-**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer arithmetics over encrypted data.
-
-It includes:
- a **Rust** API
- a **C** API
- and a **client-side WASM** API
-
-TFHE-rs is designed for developers and researchers who want full control over
-what they can do with TFHE, while not having to worry about the low-level
+**TFHE-rs** is meant for developers and researchers who want full control over
+what they can do with TFHE, while not having to worry about the low-level
 implementation. The goal is to have a stable, simple, high-performance, and
 production-ready library for all the advanced features of TFHE.
-<br></br>
-
-### Main features
-
- **Low-level cryptographic library** that implements Zama’s variant of TFHE, including programmable bootstrapping
- **Implementation of the original TFHE boolean API** that can be used as a drop-in replacement for other TFHE libraries
- **Short integer API** that enables exact, unbounded FHE integer arithmetics with up to 8 bits of message space
- **Size-efficient public key encryption**
- **Ciphertext and server key compression** for efficient data transfer
- **Full Rust API, C bindings to the Rust High-Level API, and client-side Javascript API using WASM**.
-
-*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
-<br></br>
-
-## Table of Contents
- **[Getting Started](#getting-started)**
-   - [Cargo.toml configuration](#cargotoml-configuration)
-   - [A simple example](#a-simple-example)
- **[Resources](#resources)**
-   - [TFHE deep dive](#tfhe-deep-dive)
-   - [Tutorials](#tutorials)
-   - [Documentation](#documentation)
- **[Working with TFHE-rs](#working-with-tfhe-rs)**
-   - [Disclaimers](#disclaimers)
-   - [Citations](#citations)
-   - [Contributing](#contributing)
-   - [License](#license)
- **[Support](#support)**
-<br></br>

 ## Getting Started
+The steps to run a first example are described below. 

 ### Cargo.toml configuration
 To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:
@@ -81,24 +51,20 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64-un
 ```toml
 tfhe = { version = "*", features = ["boolean", "shortint", "integer", "aarch64-unix"] }
 ```
+Note: users with ARM devices must compile `TFHE-rs` using a stable toolchain with version >= 1.72.

-+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:
+
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) 
+running Windows:

 ```toml
 tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"] }
 ```

-> [!Note]
-> Note: You need to use a Rust version >= 1.73 to compile TFHE-rs.
+Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.

-> [!Note]
-> Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.

-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
-
-### A simple example
+## A simple example

 Here is a full example:

@@ -131,13 +97,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Clear equivalent computations: 1344 * 5 = 6720
    let encrypted_res_mul = &encrypted_a * &encrypted_b;

-    // Clear equivalent computations: 6720 >> 5 = 210
+    // Clear equivalent computations: 6720 >> 5 = 210
    encrypted_a = &encrypted_res_mul >> &encrypted_b;

    // Clear equivalent computations: let casted_a = a as u8;
    let casted_a: FheUint8 = encrypted_a.cast_into();

-    // Clear equivalent computations: min(210, 7) = 7
+    // Clear equivalent computations: min(210, 7) = 7
    let encrypted_res_min = &casted_a.min(&encrypted_c);

    // Operation between clear and encrypted data:
@@ -155,70 +121,34 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 To run this code, use the following command: 
 <p align="center"> <code> cargo run --release </code> </p>

-> [!Note]
-> Note that when running code that uses `TFHE-rs`, it is highly recommended
+Note that when running code that uses `tfhe-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to get the best possible performance.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*

-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
+## Contributing
+
+There are two ways to contribute to TFHE-rs:
+
+- you can open issues to report bugs or typos, or to suggest new ideas
+- you can ask to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
+(becoming an approved contributor involves signing our Contributor License Agreement (CLA))
+
+Only approved contributors can send pull requests, so please make sure to get in touch before you do!
+
+## Credits
+
+This library uses several dependencies and we would like to thank the contributors of those
+libraries.
+
+## Need support?
+<a target="_blank" href="https://community.zama.ai">
+  <img src="https://github.com/zama-ai/tfhe-rs/assets/157474013/33d856dc-f25d-454b-a010-af12bff2aa7d">
+</a>



-## Resources 
+## Citing TFHE-rs

-### TFHE deep dive
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
-<br></br>
-
-### Tutorials
- [[Video tutorial] Implement signed integers using TFHE-rs ](https://www.zama.ai/post/video-tutorial-implement-signed-integers-ssing-tfhe-rs)
- [Homomorphic parity bit](https://docs.zama.ai/tfhe-rs/tutorials/parity_bit)
- [Homomorphic case changing on Ascii string](https://docs.zama.ai/tfhe-rs/tutorials/ascii_fhe_string)
- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
- [Dark market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
- [Regular expression engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)
-
-*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
-<br></br>
-### Documentation
-
-Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
-
-
-## Working with TFHE-rs
-
-### Disclaimers
-
-#### Security Estimation
-
-Security estimations are done using the
-[Lattice Estimator](https://github.com/malb/lattice-estimator)
-with `red_cost_model = reduction.RC.BDGL16`.
-
-When a new update is published in the Lattice Estimator, we update parameters accordingly.
-
-### Security Model
-
-The default parameters for the TFHE-rs library are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at p_error = $2^{-40}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [1]. 
-
-[1] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf
-
-#### Side-Channel Attacks
-
-Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
-and will be released in upcoming versions.
-<br></br>
-
-### Citations
 To cite TFHE-rs in academic papers, please use the following entry:

 ```text
@@ -230,35 +160,22 @@ To cite TFHE-rs in academic papers, please use the following entry:
 }
 ```

-### Contributing
+## License

-There are two ways to contribute to TFHE-rs:
+This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
+please contact us at `hello@zama.ai`.

- [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
+## Disclaimers

-Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
-<br></br>
+### Security Estimation

-### License
-This software is distributed under the **BSD-3-Clause-Clear** license. If you have any questions, please contact us at hello@zama.ai.
-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
+Security estimations are done using the
+[Lattice Estimator](https://github.com/malb/lattice-estimator)
+with `red_cost_model = reduction.RC.BDGL16`.

+When a new update is published in the Lattice Estimator, we update parameters accordingly.

-## Support
+### Side-Channel Attacks

-<a target="_blank" href="https://community.zama.ai">
-<picture>
-  <source media="(prefers-color-scheme: dark)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/08656d0a-3f44-4126-b8b6-8c601dff5380">
-  <source media="(prefers-color-scheme: light)" srcset="https://github.com/zama-ai/tfhe-rs/assets/157474013/1c9c9308-50ac-4aab-a4b9-469bb8c536a4">
-  <img alt="Support">
-</picture>
-</a>
-
-🌟 If you find this project helpful or interesting, please consider giving it a star on GitHub! Your support helps to grow the community and motivates further development. 
-
-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
+Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
+and will be released in upcoming versions.
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -15,6 +15,7 @@ Example of a Rust main below:
 ```rust
 use tfhe::{ConfigBuilder, generate_keys, FheBool};
 use tfhe::prelude::*;
+
 use tfhe_trivium::TriviumStream;

 fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
@@ -138,8 +139,10 @@ Example code:
 ```rust
 use tfhe::shortint::prelude::*;
 use tfhe::shortint::CastingKey;
+
 use tfhe::{ConfigBuilder, generate_keys, FheUint64};
 use tfhe::prelude::*;
+
 use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
--- a/apps/trivium/benches/kreyvium_bool.rs
+++ b/apps/trivium/benches/kreyvium_bool.rs
@@ -1,8 +1,10 @@
-use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheBool};
+
 use tfhe_trivium::KreyviumStream;

+use criterion::Criterion;
+
 pub fn kreyvium_bool_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
--- a/apps/trivium/benches/kreyvium_byte.rs
+++ b/apps/trivium/benches/kreyvium_byte.rs
@@ -1,8 +1,10 @@
-use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
+
 use tfhe_trivium::{KreyviumStreamByte, TransCiphering};

+use criterion::Criterion;
+
 pub fn kreyvium_byte_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
        .enable_function_evaluation()
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,9 +1,12 @@
-use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::shortint::prelude::*;
+use tfhe::shortint::KeySwitchingKey;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
+
 use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

+use criterion::Criterion;
+
 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
--- a/apps/trivium/benches/trivium_bool.rs
+++ b/apps/trivium/benches/trivium_bool.rs
@@ -1,8 +1,10 @@
-use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheBool};
+
 use tfhe_trivium::TriviumStream;

+use criterion::Criterion;
+
 pub fn trivium_bool_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
--- a/apps/trivium/benches/trivium_byte.rs
+++ b/apps/trivium/benches/trivium_byte.rs
@@ -1,8 +1,10 @@
-use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64, FheUint8};
+
 use tfhe_trivium::{TransCiphering, TriviumStreamByte};

+use criterion::Criterion;
+
 pub fn trivium_byte_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,9 +1,12 @@
-use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::shortint::prelude::*;
+use tfhe::shortint::KeySwitchingKey;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
+
 use tfhe_trivium::{TransCiphering, TriviumStreamShortint};

+use criterion::Criterion;
+
 pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default().build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
--- a/apps/trivium/src/kreyvium/kreyvium.rs
+++ b/apps/trivium/src/kreyvium/kreyvium.rs
@@ -2,10 +2,12 @@
 //! for the representation of the inner bits.

 use crate::static_deque::StaticDeque;
-use rayon::prelude::*;
+
 use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheBool, ServerKey};

+use rayon::prelude::*;
+
 /// Internal trait specifying which operations are necessary for KreyviumStream generic type
 pub trait KreyviumBoolInput<OpOutput>:
    Sized
--- a/apps/trivium/src/kreyvium/kreyvium_byte.rs
+++ b/apps/trivium/src/kreyvium/kreyvium_byte.rs
@@ -2,10 +2,12 @@
 //! for the representation of the inner bits.

 use crate::static_deque::{StaticByteDeque, StaticByteDequeInput};
-use rayon::prelude::*;
+
 use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheUint8, ServerKey};

+use rayon::prelude::*;
+
 /// Internal trait specifying which operations are necessary for KreyviumStreamByte generic type
 pub trait KreyviumByteInput<OpOutput>:
    Sized
--- a/apps/trivium/src/kreyvium/kreyvium_shortint.rs
+++ b/apps/trivium/src/kreyvium/kreyvium_shortint.rs
@@ -1,7 +1,9 @@
 use crate::static_deque::StaticDeque;
-use rayon::prelude::*;
+
 use tfhe::shortint::prelude::*;

+use rayon::prelude::*;
+
 /// KreyviumStreamShortint: a struct implementing the Kreyvium stream cipher, using a generic
 /// Ciphertext for the internal representation of bits (intended to represent a single bit). To be
 /// able to compute FHE operations, it also owns a ServerKey.
@@ -34,7 +36,7 @@ impl KreyviumStreamShortint {
        let mut c_register: [Ciphertext; 111] = [0; 111].map(|x| sk.create_trivial(x));

        for i in 0..93 {
-            a_register[i].clone_from(&key[128 - 93 + i]);
+            a_register[i] = key[128 - 93 + i].clone();
        }
        for i in 0..84 {
            b_register[i] = sk.create_trivial(iv[128 - 84 + i]);
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,7 +1,8 @@
-use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};

+use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
+
 // Values for these tests come from the github repo renaud1239/Kreyvium,
 // commit fd6828f68711276c25f55e605935028f5e843f43

--- a/apps/trivium/src/static_deque/mod.rs
+++ b/apps/trivium/src/static_deque/mod.rs
@@ -1,6 +1,5 @@
 #[allow(clippy::module_inception)]
 mod static_deque;
 pub use static_deque::StaticDeque;
-
 mod static_byte_deque;
 pub use static_byte_deque::{StaticByteDeque, StaticByteDequeInput};
--- a/apps/trivium/src/static_deque/static_byte_deque.rs
+++ b/apps/trivium/src/static_deque/static_byte_deque.rs
@@ -4,6 +4,7 @@
 //! This is pretending to store bits, and allows accessing bits in chunks of 8 consecutive.

 use crate::static_deque::StaticDeque;
+
 use tfhe::FheUint8;

 /// Internal trait specifying which operations are needed by StaticByteDeque
--- a/apps/trivium/src/trans_ciphering/mod.rs
+++ b/apps/trivium/src/trans_ciphering/mod.rs
@@ -2,11 +2,13 @@
 //! when trans ciphering is available to them.

 use crate::{KreyviumStreamByte, KreyviumStreamShortint, TriviumStreamByte, TriviumStreamShortint};
-use rayon::prelude::*;
-use tfhe::prelude::*;
 use tfhe::shortint::Ciphertext;
+
+use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheUint64, FheUint8, ServerKey};

+use rayon::prelude::*;
+
 /// Triat specifying the interface for trans ciphering a FheUint64 object. Since it is meant
 /// to be used with stream ciphers, encryption and decryption are by default the same.
 pub trait TransCiphering {
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,7 +1,8 @@
-use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};

+use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
+
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
 // file testvectors/trivium-80.80.test-vectors

--- a/apps/trivium/src/trivium/trivium_bool.rs
+++ b/apps/trivium/src/trivium/trivium_bool.rs
@@ -2,10 +2,12 @@
 //! for the representation of the inner bits.

 use crate::static_deque::StaticDeque;
-use rayon::prelude::*;
+
 use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheBool, ServerKey};

+use rayon::prelude::*;
+
 /// Internal trait specifying which operations are necessary for TriviumStream generic type
 pub trait TriviumBoolInput<OpOutput>:
    Sized
--- a/apps/trivium/src/trivium/trivium_byte.rs
+++ b/apps/trivium/src/trivium/trivium_byte.rs
@@ -2,10 +2,12 @@
 //! for the representation of the inner bits.

 use crate::static_deque::{StaticByteDeque, StaticByteDequeInput};
-use rayon::prelude::*;
+
 use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheUint8, ServerKey};

+use rayon::prelude::*;
+
 /// Internal trait specifying which operations are necessary for TriviumStreamByte generic type
 pub trait TriviumByteInput<OpOutput>:
    Sized
--- a/apps/trivium/src/trivium/trivium_shortint.rs
+++ b/apps/trivium/src/trivium/trivium_shortint.rs
@@ -1,7 +1,9 @@
 use crate::static_deque::StaticDeque;
-use rayon::prelude::*;
+
 use tfhe::shortint::prelude::*;

+use rayon::prelude::*;
+
 /// TriviumStreamShortint: a struct implementing the Trivium stream cipher, using a generic
 /// Ciphertext for the internal representation of bits (intended to represent a single bit). To be
 /// able to compute FHE operations, it also owns a ServerKey.
@@ -32,7 +34,7 @@ impl TriviumStreamShortint {
        let mut c_register: [Ciphertext; 111] = [0; 111].map(|x| sk.create_trivial(x));

        for i in 0..80 {
-            a_register[93 - 80 + i].clone_from(&key[i]);
+            a_register[93 - 80 + i] = key[i].clone();
            b_register[84 - 80 + i] = sk.create_trivial(iv[i]);
        }

--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.2.0"
+version = "0.1.3"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/README.md
+++ b/backends/tfhe-cuda-backend/README.md
@@ -30,7 +30,8 @@ The cryptographic operations it provides are:

 ## Build

-The Cuda project held in `tfhe-cuda-backend` can be compiled independently from TFHE-rs in the following way:
+The Cuda project held in `tfhe-cuda-backend` can be compiled independently from TFHE-rs in the
+following way:
 ```
 git clone git@github.com:zama-ai/tfhe-rs
 cd backends/tfhe-cuda-backend/cuda
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -2,12 +2,6 @@ use std::env;
 use std::process::Command;

 fn main() {
-    if let Ok(val) = env::var("DOCS_RS") {
-        if val.parse::<u32>() == Ok(1) {
-            return;
-        }
-    }
-
    println!("Build tfhe-cuda-backend");
    if env::consts::OS == "linux" {
        let output = Command::new("./get_os_name.sh").output().unwrap();
--- a/backends/tfhe-cuda-backend/cuda/.gitignore
+++ b/backends/tfhe-cuda-backend/cuda/.gitignore
@@ -1,2 +0,0 @@
-/build/
-include/cuda_config.h
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -58,15 +58,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
 if(${CUDA_SUCCESS})
  set(CMAKE_CUDA_ARCHITECTURES native)
-  string(REPLACE "-arch=sm_" "" CUDA_ARCH "${ARCH}")
-  set(CUDA_ARCH "${CUDA_ARCH}0")
 else()
  set(CMAKE_CUDA_ARCHITECTURES 70)
-  set(CUDA_ARCH "700")
 endif()

-add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
-
 # in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
 set(CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
@@ -76,13 +71,10 @@ set(CMAKE_CUDA_FLAGS
 set(INCLUDE_DIR include)

 add_subdirectory(src)
-enable_testing()
-add_subdirectory(tests_and_benchmarks)
 target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})

 # This is required for rust cargo build
 install(TARGETS tfhe_cuda_backend DESTINATION .)
-
 install(TARGETS tfhe_cuda_backend DESTINATION lib)

 # Define a function to add a lint target.
@@ -94,3 +86,5 @@ if(CPPLINT)
  set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
  # set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
 endif()
+
+enable_testing()
--- a/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
+++ b/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
@@ -1,19 +1,6 @@
 #!/bin/bash

-set -e
-
-while getopts ":c" option; do
-  case $option in
-    c)
-      # code to execute when flag1 is provided
-      find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
-      cmake-format -i CMakeLists.txt -c .cmake-format-config.py
-      find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
-      git diff --exit-code
-      exit
-      ;;
-  esac
-done
-find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
+find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
 cmake-format -i CMakeLists.txt -c .cmake-format-config.py
-find ./{include,src,tests_and_benchmarks/include,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
+
+find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
--- a/backends/tfhe-cuda-backend/cuda/include/bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/bootstrap.h
@@ -0,0 +1,118 @@
+#ifndef CUDA_BOOTSTRAP_H
+#define CUDA_BOOTSTRAP_H
+
+#include "device.h"
+#include <cstdint>
+
+enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
+
+extern "C" {
+void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
+                                 cuda_stream_t *stream,
+                                 uint32_t polynomial_size,
+                                 uint32_t total_polynomials);
+
+void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
+                                       cuda_stream_t *stream,
+                                       uint32_t input_lwe_dim,
+                                       uint32_t glwe_dim, uint32_t level_count,
+                                       uint32_t polynomial_size);
+
+void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
+                                       cuda_stream_t *stream,
+                                       uint32_t input_lwe_dim,
+                                       uint32_t glwe_dim, uint32_t level_count,
+                                       uint32_t polynomial_size);
+
+void scratch_cuda_bootstrap_amortized_32(
+    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
+    uint32_t max_shared_memory, bool allocate_gpu_memory);
+
+void scratch_cuda_bootstrap_amortized_64(
+    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
+    uint32_t max_shared_memory, bool allocate_gpu_memory);
+
+void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
+    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
+    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
+    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
+
+void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
+    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
+    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
+    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
+
+void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
+                                      int8_t **pbs_buffer);
+
+void scratch_cuda_bootstrap_low_latency_32(
+    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+
+void scratch_cuda_bootstrap_low_latency_64(
+    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    bool allocate_gpu_memory);
+
+void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
+    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
+    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
+    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
+
+void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
+    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
+    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
+    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
+
+void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
+                                        int8_t **pbs_buffer);
+
+uint64_t get_buffer_size_bootstrap_amortized_64(
+    uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
+
+uint64_t get_buffer_size_bootstrap_low_latency_64(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
+}
+
+#ifdef __CUDACC__
+__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
+                                         int glwe_dimension,
+                                         uint32_t level_count);
+
+template <typename T>
+__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
+                                     uint32_t polynomial_size,
+                                     int glwe_dimension, uint32_t level_count);
+
+template <typename T>
+__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
+                                     uint32_t polynomial_size,
+                                     int glwe_dimension, uint32_t level_count);
+
+template <typename T>
+__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
+    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
+    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
+
+#endif
+
+#endif // CUDA_BOOTSTRAP_H
--- a/backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h
@@ -0,0 +1,52 @@
+#ifndef CUDA_MULTI_BIT_H
+#define CUDA_MULTI_BIT_H
+
+// C-linkage declarations for the multi-bit PBS entry points (key conversion,
+// scratch setup, execution, cleanup), plus host helpers under __CUDACC__.
+
+// device.h declares cuda_stream_t, used by every entry point below. Include it
+// here so this header does not depend on the includer's include order.
+#include "device.h"
+#include <cstdint>
+
+extern "C" {
+void cuda_convert_lwe_multi_bit_bootstrap_key_64(
+    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
+    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
+    uint32_t grouping_factor);
+
+void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
+    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
+    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
+    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
+    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
+    uint32_t max_shared_memory, uint32_t chunk_size = 0);
+
+void scratch_cuda_multi_bit_pbs_64(
+    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
+    uint32_t max_shared_memory, bool allocate_gpu_memory,
+    uint32_t chunk_size = 0);
+
+void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
+}
+#ifdef __CUDACC__
+__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
+                                     uint32_t level_count,
+                                     uint32_t glwe_dimension,
+                                     uint32_t num_samples);
+
+__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
+                                             uint32_t level_count,
+                                             uint32_t glwe_dimension,
+                                             uint32_t ct_count);
+
+__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
+#endif
+
+#endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -11,22 +11,6 @@

 extern "C" {

-#define check_cuda_error(ans)                                                  \
-  { cuda_error((ans), __FILE__, __LINE__); }
-inline void cuda_error(cudaError_t code, const char *file, int line) {
-  if (code != cudaSuccess) {
-    std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code),
-                 file, line);
-    std::abort();
-  }
-}
-#define PANIC(format, ...)                                                     \
-  {                                                                            \
-    std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__,         \
-                 __LINE__, __func__, ##__VA_ARGS__);                           \
-    std::abort();                                                              \
-  }
-
 struct cuda_stream_t {
  cudaStream_t stream;
  uint32_t gpu_index;
@@ -34,58 +18,68 @@ struct cuda_stream_t {
  cuda_stream_t(uint32_t gpu_index) {
    this->gpu_index = gpu_index;

-    check_cuda_error(cudaStreamCreate(&stream));
+    cudaStreamCreate(&stream);
  }

  void release() {
-    check_cuda_error(cudaSetDevice(gpu_index));
-    check_cuda_error(cudaStreamDestroy(stream));
+    cudaSetDevice(gpu_index);
+    cudaStreamDestroy(stream);
  }

-  void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
+  void synchronize() { cudaStreamSynchronize(stream); }
 };

 cuda_stream_t *cuda_create_stream(uint32_t gpu_index);

-void cuda_destroy_stream(cuda_stream_t *stream);
+int cuda_destroy_stream(cuda_stream_t *stream);

 void *cuda_malloc(uint64_t size, uint32_t gpu_index);

 void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);

-void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
+int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

-bool cuda_check_support_cooperative_groups();
+int cuda_check_support_cooperative_groups();

-void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                              cuda_stream_t *stream);
+int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);

-void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                                  cuda_stream_t *stream);
+int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
+                             cuda_stream_t *stream);

-void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                              cuda_stream_t *stream);
+int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
+                                 cuda_stream_t *stream);

-void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
-                       cuda_stream_t *stream);
+int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);
+
+int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                             cuda_stream_t *stream);
+
+int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
+                      cuda_stream_t *stream);

 int cuda_get_number_of_gpus();

-void cuda_synchronize_device(uint32_t gpu_index);
+int cuda_synchronize_device(uint32_t gpu_index);

-void cuda_drop(void *ptr, uint32_t gpu_index);
+int cuda_drop(void *ptr, uint32_t gpu_index);

-void cuda_drop_async(void *ptr, cuda_stream_t *stream);
+int cuda_drop_async(void *ptr, cuda_stream_t *stream);

 int cuda_get_max_shared_memory(uint32_t gpu_index);

-void cuda_synchronize_stream(cuda_stream_t *stream);
+int cuda_synchronize_stream(cuda_stream_t *stream);

-void cuda_stream_add_callback(cuda_stream_t *stream,
-                              cudaStreamCallback_t callback, void *user_data);
-
-void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
-                                  void *host_pointer);
+#define check_cuda_error(ans)                                                  \
+  { cuda_error((ans), __FILE__, __LINE__); }
+inline void cuda_error(cudaError_t code, const char *file, int line,
+                       bool abort = true) {
+  if (code != cudaSuccess) {
+    fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
+            line);
+    if (abort)
+      exit(code);
+  }
+}
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -1,7 +1,7 @@
 #ifndef CUDA_LINALG_H_
 #define CUDA_LINALG_H_

-#include "programmable_bootstrap.h"
+#include "bootstrap.h"
 #include <cstdint>
 #include <device.h>

--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -1,320 +0,0 @@
-#ifndef CUDA_BOOTSTRAP_H
-#define CUDA_BOOTSTRAP_H
-
-#include "device.h"
-#include <cstdint>
-
-enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
-enum PBS_VARIANT { DEFAULT = 0, CG = 1 };
-
-extern "C" {
-void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
-                                 cuda_stream_t *stream,
-                                 uint32_t polynomial_size,
-                                 uint32_t total_polynomials);
-
-void cuda_convert_lwe_programmable_bootstrap_key_32(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);
-
-void cuda_convert_lwe_programmable_bootstrap_key_64(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size);
-
-void scratch_cuda_programmable_bootstrap_amortized_32(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory);
-
-void scratch_cuda_programmable_bootstrap_amortized_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory);
-
-void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cleanup_cuda_programmable_bootstrap_amortized(cuda_stream_t *stream,
-                                                   int8_t **pbs_buffer);
-
-void scratch_cuda_programmable_bootstrap_32(
-    cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
-
-void scratch_cuda_programmable_bootstrap_64(
-    cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
-
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cleanup_cuda_programmable_bootstrap(cuda_stream_t *stream,
-                                         int8_t **pbs_buffer);
-
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
-
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
-}
-
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_programmable_bootstrap_step_one(
-    uint32_t polynomial_size) {
-  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
-         sizeof(double2) * polynomial_size / 2; // accumulator fft
-}
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_programmable_bootstrap_step_two(
-    uint32_t polynomial_size) {
-  return sizeof(Torus) * polynomial_size +      // accumulator
-         sizeof(double2) * polynomial_size / 2; // accumulator fft
-}
-
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
-  return sizeof(double2) * polynomial_size / 2; // accumulator fft
-}
-
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
-  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
-         sizeof(Torus) * polynomial_size +      // accumulator
-         sizeof(double2) * polynomial_size / 2; // accumulator fft
-}
-
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
-  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
-}
-
-template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
-
-template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
-  int8_t *d_mem;
-
-  Torus *global_accumulator;
-  double2 *global_accumulator_fft;
-
-  PBS_VARIANT pbs_variant;
-
-  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
-             uint32_t polynomial_size, uint32_t level_count,
-             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
-             bool allocate_gpu_memory) {
-    this->pbs_variant = pbs_variant;
-
-    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
-
-    if (allocate_gpu_memory) {
-      switch (pbs_variant) {
-      case PBS_VARIANT::DEFAULT: {
-        uint64_t full_sm_step_one =
-            get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
-                polynomial_size);
-        uint64_t full_sm_step_two =
-            get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
-                polynomial_size);
-        uint64_t partial_sm =
-            get_buffer_size_partial_sm_programmable_bootstrap<Torus>(
-                polynomial_size);
-
-        uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
-        uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
-        uint64_t full_dm = full_sm_step_one;
-
-        uint64_t device_mem = 0;
-        if (max_shared_memory < partial_sm) {
-          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm_step_two) {
-          device_mem =
-              (partial_dm_step_two + partial_dm_step_one * level_count) *
-              input_lwe_ciphertext_count * (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm_step_one) {
-          device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
-                       level_count * (glwe_dimension + 1);
-        }
-        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
-
-        global_accumulator_fft = (double2 *)cuda_malloc_async(
-            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
-                (polynomial_size / 2) * sizeof(double2),
-            stream);
-
-        global_accumulator = (Torus *)cuda_malloc_async(
-            (glwe_dimension + 1) * input_lwe_ciphertext_count *
-                polynomial_size * sizeof(Torus),
-            stream);
-      } break;
-      case PBS_VARIANT::CG: {
-        uint64_t full_sm =
-            get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(
-                polynomial_size);
-        uint64_t partial_sm =
-            get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
-                polynomial_size);
-
-        uint64_t partial_dm = full_sm - partial_sm;
-        uint64_t full_dm = full_sm;
-        uint64_t device_mem = 0;
-
-        if (max_shared_memory < partial_sm) {
-          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm) {
-          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        }
-
-        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
-
-        global_accumulator_fft = (double2 *)cuda_malloc_async(
-            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
-                polynomial_size / 2 * sizeof(double2),
-            stream);
-      } break;
-      default:
-        PANIC("Cuda error (PBS): unsupported implementation variant.")
-      }
-    }
-  }
-
-  void release(cuda_stream_t *stream) {
-    cuda_drop_async(d_mem, stream);
-    cuda_drop_async(global_accumulator_fft, stream);
-
-    if (pbs_variant == DEFAULT)
-      cuda_drop_async(global_accumulator, stream);
-  }
-};
-
-template <typename Torus>
-__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
-
-  uint64_t full_sm =
-      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
-  uint64_t partial_sm =
-      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
-          polynomial_size);
-  uint64_t partial_dm = full_sm - partial_sm;
-  uint64_t full_dm = full_sm;
-  uint64_t device_mem = 0;
-  if (max_shared_memory < partial_sm) {
-    device_mem = full_dm * input_lwe_ciphertext_count * level_count *
-                 (glwe_dimension + 1);
-  } else if (max_shared_memory < full_sm) {
-    device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
-                 (glwe_dimension + 1);
-  }
-  uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
-                                          input_lwe_ciphertext_count *
-                                          polynomial_size / 2 * sizeof(double2);
-  return buffer_size + buffer_size % sizeof(double2);
-}
-
-template <typename Torus>
-bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
-                                                   uint32_t polynomial_size,
-                                                   uint32_t level_count,
-                                                   uint32_t num_samples,
-                                                   uint32_t max_shared_memory);
-
-template <typename Torus>
-void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, double2 *bootstrapping_key,
-    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
-
-template <typename Torus>
-void cuda_programmable_bootstrap_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, double2 *bootstrapping_key,
-    pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
-
-template <typename Torus, typename STorus>
-void scratch_cuda_programmable_bootstrap_cg(
-    cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
-
-template <typename Torus, typename STorus>
-void scratch_cuda_programmable_bootstrap(
-    cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
-
-#ifdef __CUDACC__
-__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
-                                         int glwe_dimension,
-                                         uint32_t level_count);
-
-template <typename T>
-__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
-                                     uint32_t polynomial_size,
-                                     int glwe_dimension, uint32_t level_count);
-
-template <typename T>
-__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
-                                     uint32_t polynomial_size,
-                                     int glwe_dimension, uint32_t level_count);
-
-template <typename T>
-__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
-    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
-    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
-
-#endif
-
-#endif // CUDA_BOOTSTRAP_H
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -1,241 +0,0 @@
-#ifndef CUDA_MULTI_BIT_H
-#define CUDA_MULTI_BIT_H
-
-#include "programmable_bootstrap.h"
-#include <cstdint>
-
-extern "C" {
-
-bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory);
-
-void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
-    uint32_t grouping_factor);
-
-void scratch_cuda_multi_bit_programmable_bootstrap_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t chunk_size = 0);
-
-void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
-
-void scratch_cuda_generic_multi_bit_programmable_bootstrap_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size = 0);
-
-void cuda_generic_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
-
-void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
-                                                   int8_t **pbs_buffer);
-}
-
-template <typename Torus, typename STorus>
-void scratch_cuda_cg_multi_bit_programmable_bootstrap(
-    cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
-
-template <typename Torus>
-void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, Torus *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0);
-
-template <typename Torus, typename STorus>
-void scratch_cuda_multi_bit_programmable_bootstrap(
-    cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
-
-template <typename Torus>
-void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
-    Torus *lwe_input_indexes, Torus *bootstrapping_key,
-    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0);
-
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
-    uint32_t polynomial_size);
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one(
-    uint32_t polynomial_size);
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
-    uint32_t polynomial_size);
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one(
-    uint32_t polynomial_size);
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_full_sm_cg_multibit_programmable_bootstrap(
-    uint32_t polynomial_size);
-template <typename Torus>
-__host__ __device__ uint64_t
-get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
-    uint32_t polynomial_size);
-
-template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
-  int8_t *d_mem_keybundle = NULL;
-  int8_t *d_mem_acc_step_one = NULL;
-  int8_t *d_mem_acc_step_two = NULL;
-  int8_t *d_mem_acc_cg = NULL;
-
-  double2 *keybundle_fft;
-  Torus *global_accumulator;
-  double2 *global_accumulator_fft;
-
-  PBS_VARIANT pbs_variant;
-
-  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
-             uint32_t polynomial_size, uint32_t level_count,
-             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
-             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
-    this->pbs_variant = pbs_variant;
-    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
-
-    uint64_t full_sm_keybundle =
-        get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<
-            Torus>(polynomial_size);
-    uint64_t full_sm_accumulate_step_one =
-        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
-            polynomial_size);
-    uint64_t partial_sm_accumulate_step_one =
-        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
-            Torus>(polynomial_size);
-    uint64_t full_sm_accumulate_step_two =
-        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
-            polynomial_size);
-    uint64_t full_sm_cg_accumulate =
-        get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
-            polynomial_size);
-    uint64_t partial_sm_cg_accumulate =
-        get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
-            polynomial_size);
-
-    auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
-                                (glwe_dimension + 1) * (glwe_dimension + 1) *
-                                level_count;
-    auto num_blocks_acc_step_one =
-        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
-    auto num_blocks_acc_step_two =
-        input_lwe_ciphertext_count * (glwe_dimension + 1);
-    auto num_blocks_acc_cg =
-        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
-
-    if (allocate_gpu_memory) {
-      // Keybundle
-      if (max_shared_memory < full_sm_keybundle)
-        d_mem_keybundle = (int8_t *)cuda_malloc_async(
-            num_blocks_keybundle * full_sm_keybundle, stream);
-
-      switch (pbs_variant) {
-      case DEFAULT:
-        // Accumulator step one
-        if (max_shared_memory < partial_sm_accumulate_step_one)
-          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_step_one * full_sm_accumulate_step_one, stream);
-        else if (max_shared_memory < full_sm_accumulate_step_one)
-          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream);
-
-        // Accumulator step two
-        if (max_shared_memory < full_sm_accumulate_step_two)
-          d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_step_two * full_sm_accumulate_step_two, stream);
-        break;
-      case CG:
-        // Accumulator CG
-        if (max_shared_memory < partial_sm_cg_accumulate)
-          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_cg * full_sm_cg_accumulate, stream);
-        else if (max_shared_memory < full_sm_cg_accumulate)
-          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
-        break;
-      default:
-        PANIC("Cuda error (PBS): unsupported implementation variant.")
-      }
-
-      keybundle_fft = (double2 *)cuda_malloc_async(
-          num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
-          stream);
-      global_accumulator = (Torus *)cuda_malloc_async(
-          num_blocks_acc_step_two * polynomial_size * sizeof(Torus), stream);
-      global_accumulator_fft = (double2 *)cuda_malloc_async(
-          num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
-          stream);
-    }
-  }
-
-  void release(cuda_stream_t *stream) {
-
-    if (d_mem_keybundle)
-      cuda_drop_async(d_mem_keybundle, stream);
-    switch (pbs_variant) {
-    case DEFAULT:
-      if (d_mem_acc_step_one)
-        cuda_drop_async(d_mem_acc_step_one, stream);
-      if (d_mem_acc_step_two)
-        cuda_drop_async(d_mem_acc_step_two, stream);
-      break;
-    case CG:
-      if (d_mem_acc_cg)
-        cuda_drop_async(d_mem_acc_cg, stream);
-      break;
-    default:
-      PANIC("Cuda error (PBS): unsupported implementation variant.")
-    }
-
-    cuda_drop_async(keybundle_fft, stream);
-    cuda_drop_async(global_accumulator, stream);
-    cuda_drop_async(global_accumulator_fft, stream);
-  }
-};
-
-#ifdef __CUDACC__
-
-__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count);
-
-#endif
-
-#endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -106,23 +106,23 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
  cudaSetDevice(stream->gpu_index);
  constexpr int ideal_threads = 128;

-  int lwe_size = lwe_dimension_out + 1;
+  int lwe_dim = lwe_dimension_out + 1;
  int lwe_lower, lwe_upper, cutoff;
-  if (lwe_size % ideal_threads == 0) {
-    lwe_lower = lwe_size / ideal_threads;
-    lwe_upper = lwe_size / ideal_threads;
+  if (lwe_dim % ideal_threads == 0) {
+    lwe_lower = lwe_dim / ideal_threads;
+    lwe_upper = lwe_dim / ideal_threads;
    cutoff = 0;
  } else {
-    int y = ceil((double)lwe_size / (double)ideal_threads) * ideal_threads -
-            lwe_size;
+    int y =
+        ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
    cutoff = ideal_threads - y;
-    lwe_lower = lwe_size / ideal_threads;
-    lwe_upper = (int)ceil((double)lwe_size / (double)ideal_threads);
+    lwe_lower = lwe_dim / ideal_threads;
+    lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
  }

-  int lwe_size_after = lwe_size * num_samples;
+  int lwe_size_after = (lwe_dimension_out + 1) * num_samples;

-  int shared_mem = sizeof(Torus) * lwe_size;
+  int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);

  cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
  check_cuda_error(cudaGetLastError());
@@ -130,7 +130,11 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
  dim3 grid(num_samples, 1, 1);
  dim3 threads(ideal_threads, 1, 1);

-  keyswitch<Torus><<<grid, threads, shared_mem, stream->stream>>>(
+  //    cudaFuncSetAttribute(keyswitch<Torus>,
+  //                         cudaFuncAttributeMaxDynamicSharedMemorySize,
+  //                         shared_mem);
+
+  keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
      lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
      lwe_upper, cutoff);
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -4,21 +4,25 @@

 /// Unsafe function to create a CUDA stream, must check first that GPU exists
 cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cudaSetDevice(gpu_index);
  cuda_stream_t *stream = new cuda_stream_t(gpu_index);
  return stream;
 }

 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
-void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); }
+int cuda_destroy_stream(cuda_stream_t *stream) {
+  stream->release();
+  return 0;
+}

 /// Unsafe function that will try to allocate even if gpu_index is invalid
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  cudaSetDevice(gpu_index);
  void *ptr;
-  check_cuda_error(cudaMalloc((void **)&ptr, size));
+  cudaMalloc((void **)&ptr, size);
+  check_cuda_error(cudaGetLastError());

  return ptr;
 }
@@ -26,7 +30,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 /// Allocates a size-byte array at the device memory. Tries to do it
 /// asynchronously.
 void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+  cudaSetDevice(stream->gpu_index);
  void *ptr;

 #ifndef CUDART_VERSION
@@ -48,88 +52,184 @@ void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
  return ptr;
 }

-/// Check that allocation is valid
-void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-  size_t total_mem, free_mem;
-  check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
-  if (size > free_mem) {
-    PANIC("Cuda error: not enough memory on device. "
-          "Available: %zu vs Requested: %lu",
-          free_mem, size)
+/// Checks that allocation is valid
+/// 0: valid
+/// -1: invalid, not enough memory in device
+/// -2: invalid, gpu index doesn't exist
+int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
+
+  if (gpu_index >= cuda_get_number_of_gpus()) {
+    // error code: invalid gpu_index
+    return -2;
  }
+  cudaSetDevice(gpu_index);
+  size_t total_mem, free_mem;
+  cudaMemGetInfo(&free_mem, &total_mem);
+  if (size > free_mem) {
+    // error code: not enough memory
+    return -1;
+  }
+  return 0;
 }

 /// Returns
-///  false if Cooperative Groups is not supported.
-///  true otherwise
-bool cuda_check_support_cooperative_groups() {
+///  -> 0 if Cooperative Groups is not supported.
+///  -> 1 otherwise
+int cuda_check_support_cooperative_groups() {
  int cooperative_groups_supported = 0;
-  check_cuda_error(cudaDeviceGetAttribute(&cooperative_groups_supported,
-                                          cudaDevAttrCooperativeLaunch, 0));
+  cudaDeviceGetAttribute(&cooperative_groups_supported,
+                         cudaDevAttrCooperativeLaunch, 0);

  return cooperative_groups_supported > 0;
 }

-/// Copy memory to the GPU asynchronously
-void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                              cuda_stream_t *stream) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
+/// Tries to copy memory to the GPU asynchronously
+/// 0: success
+/// -1: error, invalid device pointer
+/// -2: error, gpu index doesn't exist
+/// -3: error, zero copy size
+int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
+                             cuda_stream_t *stream) {
+  if (size == 0) {
+    // error code: zero copy size
+    return -3;
  }

-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
+    // error code: invalid gpu_index
+    return -2;
+  }
+  cudaPointerAttributes attr;
+  cudaPointerGetAttributes(&attr, dest);
+  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
+    // error code: invalid device pointer
+    return -1;
+  }
+
+  cudaSetDevice(stream->gpu_index);
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
+  return 0;
 }

-/// Copy memory within a GPU asynchronously
-void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                                  cuda_stream_t *stream) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr_dest;
-  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.device != stream->gpu_index &&
-      attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
-  cudaPointerAttributes attr_src;
-  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.device != stream->gpu_index &&
-      attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
-  if (attr_src.device != attr_dest.device) {
-    PANIC("Cuda error: different devices specified in copy from GPU to GPU.")
+/// Tries to copy memory to the GPU synchronously
+/// 0: success
+/// -1: error, invalid device pointer
+/// -2: unused here (this function takes no gpu index or stream)
+/// -3: error, zero copy size
+int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) {
+  if (size == 0) {
+    // error code: zero copy size
+    return -3;
  }

-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+  cudaPointerAttributes attr;
+  cudaPointerGetAttributes(&attr, dest);
+  if (attr.type != cudaMemoryTypeDevice) {
+    // error code: invalid device pointer
+    return -1;
+  }
+
+  check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
+  return 0;
+}
+
+/// Tries to copy memory to the CPU synchronously
+/// 0: success
+/// -1: error, invalid device pointer
+/// -2: unused here (this function takes no gpu index or stream)
+/// -3: error, zero copy size
+int cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {
+  if (size == 0) {
+    // error code: zero copy size
+    return -3;
+  }
+
+  cudaPointerAttributes attr;
+  cudaPointerGetAttributes(&attr, src);
+  if (attr.type != cudaMemoryTypeDevice) {
+    // error code: invalid device pointer
+    return -1;
+  }
+
+  check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
+  return 0;
+}
+
+/// Tries to copy memory within a GPU asynchronously
+/// 0: success
+/// -1: error, invalid device pointer or src/dest on different devices
+/// -2: error, gpu index doesn't exist
+/// -3: error, zero copy size
+int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
+                                 cuda_stream_t *stream) {
+  if (size == 0) {
+    // error code: zero copy size
+    return -3;
+  }
+
+  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
+    // error code: invalid gpu_index
+    return -2;
+  }
+  cudaPointerAttributes attr_dest;
+  cudaPointerGetAttributes(&attr_dest, dest);
+  if (attr_dest.device != stream->gpu_index &&
+      attr_dest.type != cudaMemoryTypeDevice) {
+    // error code: invalid device pointer
+    return -1;
+  }
+  cudaPointerAttributes attr_src;
+  cudaPointerGetAttributes(&attr_src, src);
+  if (attr_src.device != stream->gpu_index &&
+      attr_src.type != cudaMemoryTypeDevice) {
+    // error code: invalid device pointer
+    return -1;
+  }
+  if (attr_src.device != attr_dest.device) {
+    // error code: different devices
+    return -1;
+  }
+
+  cudaSetDevice(stream->gpu_index);
  check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
                                   stream->stream));
+  return 0;
 }

 /// Synchronizes device
-void cuda_synchronize_device(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-  check_cuda_error(cudaDeviceSynchronize());
+/// 0: success
+/// -2: error, gpu index doesn't exist
+int cuda_synchronize_device(uint32_t gpu_index) {
+  if (gpu_index >= cuda_get_number_of_gpus()) {
+    // error code: invalid gpu_index
+    return -2;
+  }
+  cudaSetDevice(gpu_index);
+  cudaDeviceSynchronize();
+  return 0;
 }

-void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
-                       cuda_stream_t *stream) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
+int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
+                      cuda_stream_t *stream) {
+  if (size == 0) {
+    // error code: zero copy size
+    return -3;
  }
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+
+  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
+    // error code: invalid gpu_index
+    return -2;
+  }
+  cudaPointerAttributes attr;
+  cudaPointerGetAttributes(&attr, dest);
+  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
+    // error code: invalid device pointer
+    return -1;
+  }
+  cudaSetDevice(stream->gpu_index);
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
+  return 0;
 }

 template <typename Torus>
@@ -142,18 +242,12 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
 template <typename Torus>
 void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
                          Torus n) {
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
-  if (attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in cuda set value.")
-  }
  int block_size = 256;
  int num_blocks = (n + block_size - 1) / block_size;

  // Launch the kernel
  cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
                                                                n);
-  check_cuda_error(cudaGetLastError());
 }

 /// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
@@ -162,39 +256,57 @@ template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
 template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
                                   uint32_t value, uint32_t n);

-/// Copy memory to the CPU asynchronously
-void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                              cuda_stream_t *stream) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, src));
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
+/// Tries to copy memory to the CPU asynchronously
+/// 0: success
+/// -1: error, invalid device pointer
+/// -2: error, gpu index doesn't exist
+/// -3: error, zero copy size
+int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
+                             cuda_stream_t *stream) {
+  if (size == 0) {
+    // error code: zero copy size
+    return -3;
  }

-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+  if (stream->gpu_index >= cuda_get_number_of_gpus()) {
+    // error code: invalid gpu_index
+    return -2;
+  }
+  cudaPointerAttributes attr;
+  cudaPointerGetAttributes(&attr, src);
+  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
+    // error code: invalid device pointer
+    return -1;
+  }
+
+  cudaSetDevice(stream->gpu_index);
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
+  return 0;
 }

 /// Return number of GPUs available
 int cuda_get_number_of_gpus() {
  int num_gpus;
-  check_cuda_error(cudaGetDeviceCount(&num_gpus));
+  cudaGetDeviceCount(&num_gpus);
  return num_gpus;
 }

 /// Drop a cuda array
-void cuda_drop(void *ptr, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+int cuda_drop(void *ptr, uint32_t gpu_index) {
+  if (gpu_index >= cuda_get_number_of_gpus()) {
+    // error code: invalid gpu_index
+    return -2;
+  }
+  cudaSetDevice(gpu_index);
  check_cuda_error(cudaFree(ptr));
+  return 0;
 }

-/// Drop a cuda array asynchronously, if supported on the device
-void cuda_drop_async(void *ptr, cuda_stream_t *stream) {
+/// Drop a cuda array. Tries to do it asynchronously
+int cuda_drop_async(void *ptr, cuda_stream_t *stream) {

-  check_cuda_error(cudaSetDevice(stream->gpu_index));
+  cudaSetDevice(stream->gpu_index);
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)
@@ -211,28 +323,28 @@ void cuda_drop_async(void *ptr, cuda_stream_t *stream) {
 #else
  check_cuda_error(cudaFree(ptr));
 #endif
+  return 0;
 }

 /// Get the maximum size for the shared memory
 int cuda_get_max_shared_memory(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
+  if (gpu_index >= cuda_get_number_of_gpus()) {
+    // error code: invalid gpu_index
+    return -2;
+  }
+  cudaSetDevice(gpu_index);
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, gpu_index);
  int max_shared_memory = 0;
-  cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
-                         gpu_index);
-  check_cuda_error(cudaGetLastError());
+  if (prop.major >= 6) {
+    max_shared_memory = prop.sharedMemPerMultiprocessor;
+  } else {
+    max_shared_memory = prop.sharedMemPerBlock;
+  }
  return max_shared_memory;
 }

-void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }
-
-void cuda_stream_add_callback(cuda_stream_t *stream,
-                              cudaStreamCallback_t callback, void *user_data) {
-
-  check_cuda_error(
-      cudaStreamAddCallback(stream->stream, callback, user_data, 0));
-}
-
-void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
-                                  void *host_pointer) {
-  free(host_pointer);
+int cuda_synchronize_stream(cuda_stream_t *stream) {
+  stream->synchronize();
+  return 0;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
@@ -181,7 +181,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
  // from level 8, we need to check size of params degree, because we support
  // minimum actual polynomial size = 256,  when compressed size is halfed and
  // minimum supported compressed size is 128, so we always need first 7
-  // levels of butterfly operation, since butterfly levels are hardcoded
+  // levels of butterfly operation, since butterfly levels are hardcoded
  // we need to check if polynomial size is big enough to require specific level
  // of butterfly.
  if constexpr (params::degree >= 256) {
@@ -353,7 +353,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {

  // compressed size = 8192 is actual polynomial size = 16384.
  // twiddles for this size can't fit in constant memory so
-  // butterfly operation for this level access device memory to fetch
+  // butterfly operation for this level accesses device memory to fetch
  // twiddles
  if constexpr (params::degree >= 8192) {
    // level 13
@@ -484,7 +484,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  // below level 8, we don't need to check size of params degree, because we
  // support minimum actual polynomial size = 256,  when compressed size is
  // halfed and minimum supported compressed size is 128, so we always need
-  // last 7 levels of butterfly operation, since butterfly levels are hardcoded
+  // last 7 levels of butterfly operation, since butterfly levels are hardcoded
  // we don't need to check if polynomial size is big enough to require
  // specific level of butterfly.
  // level 7
--- a/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
@@ -3,7 +3,7 @@

 /*
 * 'negtwiddles' are stored in constant memory for faster access times
- * because of it's limited size, only twiddles for up to 2^12 polynomial size
+ * because of its limited size, only twiddles for up to 2^12 polynomial size
 * can be stored there, twiddles for 2^13 are stored in device memory
 * 'negtwiddles13'
 */
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -5,8 +5,8 @@
 #include "device.h"
 #include "integer.cuh"
 #include "integer.h"
-#include "pbs/programmable_bootstrap_classic.cuh"
-#include "pbs/programmable_bootstrap_multibit.cuh"
+#include "pbs/bootstrap_low_latency.cuh"
+#include "pbs/bootstrap_multibit.cuh"
 #include "polynomial/functions.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <omp.h>
@@ -44,7 +44,6 @@ __host__ void scratch_cuda_integer_radix_bitop_kb(
    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
    bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
                                         allocate_gpu_memory);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -10,7 +10,6 @@ __host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
                          int_radix_lut<Torus> *predicate, void *bsk,
                          Torus *ksk, uint32_t num_radix_blocks) {
-  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;

  int big_lwe_size = params.big_lwe_dimension + 1;
@@ -29,8 +28,8 @@ __host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,

    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
                                   stream->stream>>>(
-        lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
-        lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
+        lwe_array_out_block, lwe_array_input_block, lwe_condition,
+        predicate->lwe_indexes, params.big_lwe_dimension,
        params.message_modulus, 1);
    check_cuda_error(cudaGetLastError());
  }
@@ -95,7 +94,6 @@ __host__ void scratch_cuda_integer_radix_cmux_kb(
    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
    int_radix_params params, bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
                                        num_radix_blocks, allocate_gpu_memory);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -5,8 +5,8 @@ void scratch_cuda_integer_radix_comparison_kb_64(
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, COMPARISON_TYPE op_type, bool is_signed,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
@@ -17,9 +17,9 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  switch (op_type) {
  case EQ:
  case NE:
-    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-        stream, (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks,
-        params, op_type, false, allocate_gpu_memory);
+    scratch_cuda_integer_radix_equality_check_kb<uint64_t>(
+        stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
+        lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
    break;
  case GT:
  case GE:
@@ -27,9 +27,9 @@ void scratch_cuda_integer_radix_comparison_kb_64(
  case LE:
  case MAX:
  case MIN:
-    scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-        stream, (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks,
-        params, op_type, is_signed, allocate_gpu_memory);
+    scratch_cuda_integer_radix_difference_check_kb<uint64_t>(
+        stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
+        lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
    break;
  }
 }
@@ -37,7 +37,7 @@ void scratch_cuda_integer_radix_comparison_kb_64(
 void cuda_comparison_integer_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t num_radix_blocks) {
+    uint32_t lwe_ciphertext_count) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
@@ -48,7 +48,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), num_radix_blocks);
+        static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
    break;
  case GT:
  case GE:
@@ -59,7 +59,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer,
        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
-        num_radix_blocks);
+        lwe_ciphertext_count);
    break;
  case MAX:
  case MIN:
@@ -67,10 +67,10 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
        stream, static_cast<uint64_t *>(lwe_array_out),
        static_cast<uint64_t *>(lwe_array_1),
        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), num_radix_blocks);
+        static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
    break;
  default:
-    PANIC("Cuda error: integer operation not supported")
+    printf("Not implemented\n");
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -8,8 +8,8 @@
 #include "integer/cmux.cuh"
 #include "integer/negation.cuh"
 #include "integer/scalar_addition.cuh"
-#include "pbs/programmable_bootstrap_classic.cuh"
-#include "pbs/programmable_bootstrap_multibit.cuh"
+#include "pbs/bootstrap_low_latency.cuh"
+#include "pbs/bootstrap_multibit.cuh"
 #include "types/complex/operations.cuh"
 #include "utils/kernel_dimensions.cuh"

@@ -37,7 +37,6 @@ __host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
                                    Torus *input, uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -47,13 +46,6 @@ __host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
  check_cuda_error(cudaGetLastError());
 }

-/* This takes an array of lwe ciphertexts, where each is an encryption of
- * either 0 or 1.
- *
- * It writes in lwe_array_out a single lwe ciphertext encrypting 1 if all input
- * blocks are 1 otherwise the block encrypts 0
- *
- */
 template <typename Torus>
 __host__ void
 are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
@@ -61,7 +53,6 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                               Torus *ksk, uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -71,25 +62,24 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,

  auto are_all_block_true_buffer =
      mem_ptr->eq_buffer->are_all_block_true_buffer;
-  auto tmp_out = are_all_block_true_buffer->tmp_out;

  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

  cuda_memcpy_async_gpu_to_gpu(
-      tmp_out, lwe_array_in,
+      lwe_array_out, lwe_array_in,
      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);

+  int lut_num_blocks = 0;
  uint32_t remaining_blocks = num_radix_blocks;
-
-  while (remaining_blocks > 0) {
+  while (remaining_blocks > 1) {
    // Split in max_value chunks
    uint32_t chunk_length = std::min(max_value, remaining_blocks);
    int num_chunks = remaining_blocks / chunk_length;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
-    auto input_blocks = tmp_out;
+    auto input_blocks = lwe_array_out;
    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
      accumulate_all_blocks(stream, accumulator, input_blocks,
@@ -100,109 +90,35 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
      input_blocks += (big_lwe_dimension + 1) * chunk_length;
    }
    accumulator = are_all_block_true_buffer->tmp_block_accumulated;
-    auto is_equal_to_num_blocks_map =
-        &are_all_block_true_buffer->is_equal_to_lut_map;

    // Selects a LUT
    int_radix_lut<Torus> *lut;
    if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
      // is_non_zero_lut_buffer LUT
      lut = mem_ptr->eq_buffer->is_non_zero_lut;
+    } else if (chunk_length == max_value) {
+      // is_max_value LUT
+      lut = are_all_block_true_buffer->is_max_value_lut;
    } else {
-      if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
-          (*is_equal_to_num_blocks_map).end()) {
-        // The LUT is already computed
-        lut = (*is_equal_to_num_blocks_map)[chunk_length];
-      } else {
-        // LUT needs to be computed
-        auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
-                                                num_radix_blocks, true);
-
+      // is_equal_to_num_blocks LUT
+      lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
+      if (chunk_length != lut_num_blocks) {
        auto is_equal_to_num_blocks_lut_f = [max_value,
                                             chunk_length](Torus x) -> Torus {
          return (x & max_value) == chunk_length;
        };
        generate_device_accumulator<Torus>(
-            stream, new_lut->lut, glwe_dimension, polynomial_size,
-            message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);
+            stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
+            carry_modulus, is_equal_to_num_blocks_lut_f);

-        (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
-        lut = new_lut;
+        // We don't have to generate this lut again
+        lut_num_blocks = chunk_length;
      }
    }

    // Applies the LUT
-    if (remaining_blocks == 1) {
-      // In the last iteration we copy the output to the final address
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          stream, lwe_array_out, accumulator, bsk, ksk, 1, lut);
-      return;
-    } else {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          stream, tmp_out, accumulator, bsk, ksk, num_chunks, lut);
-    }
-  }
-}
-
-/* This takes an array of lwe ciphertexts, where each is an encryption of
- * either 0 or 1.
- *
- * It writes in lwe_array_out a single lwe ciphertext encrypting 1 if at least
- * one input ciphertext encrypts 1 otherwise encrypts 0
- */
-template <typename Torus>
-__host__ void is_at_least_one_comparisons_block_true(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
-    uint32_t num_radix_blocks) {
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;
-
-  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t max_value = total_modulus - 1;
-
-  cuda_memcpy_async_gpu_to_gpu(
-      mem_ptr->tmp_lwe_array_out, lwe_array_in,
-      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
-
-  uint32_t remaining_blocks = num_radix_blocks;
-  while (remaining_blocks > 0) {
-    // Split in max_value chunks
-    uint32_t chunk_length = std::min(max_value, remaining_blocks);
-    int num_chunks = remaining_blocks / chunk_length;
-
-    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
-    // as in the worst case we will be adding `max_value` ones
-    auto input_blocks = mem_ptr->tmp_lwe_array_out;
-    auto accumulator = buffer->tmp_block_accumulated;
-    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(stream, accumulator, input_blocks,
-                            big_lwe_dimension, chunk_length);
-
-      accumulator += (big_lwe_dimension + 1);
-      remaining_blocks -= (chunk_length - 1);
-      input_blocks += (big_lwe_dimension + 1) * chunk_length;
-    }
-    accumulator = buffer->tmp_block_accumulated;
-
-    // Selects a LUT
-    int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;
-
-    // Applies the LUT
-    if (remaining_blocks == 1) {
-      // In the last iteration we copy the output to the final address
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          stream, lwe_array_out, accumulator, bsk, ksk, 1, lut);
-      return;
-    } else {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          stream, mem_ptr->tmp_lwe_array_out, accumulator, bsk, ksk, num_chunks,
-          lut);
-    }
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
  }
 }

@@ -229,9 +145,8 @@ template <typename Torus>
 __host__ void host_compare_with_zero_equality(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
-    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
+    int32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
@@ -260,6 +175,7 @@ __host__ void host_compare_with_zero_equality(
    num_sum_blocks = 1;
  } else {
    uint32_t remainder_blocks = num_radix_blocks;
+
    auto sum_i = sum;
    auto chunk = lwe_array_in;
    while (remainder_blocks > 1) {
@@ -273,15 +189,21 @@ __host__ void host_compare_with_zero_equality(
      remainder_blocks -= (chunk_size - 1);

      // Update operands
-      chunk += (chunk_size - 1) * big_lwe_size;
+      chunk += chunk_size * big_lwe_size;
      sum_i += big_lwe_size;
    }
  }

+  auto is_equal_to_zero_lut = mem_ptr->diff_buffer->is_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, sum, sum, bsk, ksk, num_sum_blocks, zero_comparison);
+      stream, sum, sum, bsk, ksk, num_sum_blocks, is_equal_to_zero_lut);
  are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
                                 num_sum_blocks);
+
+  // The result is in the first block. Everything else is
+  //  garbage, so it is zeroed below.
+  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
+                    big_lwe_size_bytes * (num_radix_blocks - 1), stream);
 }

 template <typename Torus>
@@ -290,9 +212,11 @@ __host__ void host_integer_radix_equality_check_kb(
    Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto eq_buffer = mem_ptr->eq_buffer;

+  auto params = mem_ptr->params;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+
  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
  integer_radix_apply_bivariate_lookup_table_kb(
@@ -301,10 +225,26 @@ __host__ void host_integer_radix_equality_check_kb(

  // This takes a Vec of blocks, where each block is either 0 or 1.
  //
-  // It returns a block encrypting 1 if all input blocks are 1
+  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
  are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
                                 bsk, ksk, num_radix_blocks);
+
+  // Zero all blocks but the first
+  size_t big_lwe_size = big_lwe_dimension + 1;
+  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
+                    big_lwe_size_bytes * (num_radix_blocks - 1), stream);
+}
+
+template <typename Torus>
+__host__ void scratch_cuda_integer_radix_equality_check_kb(
+    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
+    bool allocate_gpu_memory) {
+
+  *mem_ptr = new int_comparison_buffer<Torus>(
+      stream, op, params, num_radix_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
@@ -362,7 +302,6 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
                    std::function<Torus(Torus)> sign_handler_f, void *bsk,
                    Torus *ksk, uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = tree_buffer->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -439,45 +378,38 @@ __host__ void host_integer_radix_difference_check_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
    Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
-    uint32_t num_radix_blocks) {
+    uint32_t total_num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto diff_buffer = mem_ptr->diff_buffer;

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto big_lwe_size = big_lwe_dimension + 1;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

-  uint32_t packed_num_radix_blocks = num_radix_blocks;
+  uint32_t num_radix_blocks = total_num_radix_blocks;
  auto lhs = lwe_array_left;
  auto rhs = lwe_array_right;
-  if (carry_modulus >= message_modulus) {
+  if (carry_modulus == message_modulus) {
    // Packing is possible
    // Pack inputs
    Torus *packed_left = diff_buffer->tmp_packed_left;
    Torus *packed_right = diff_buffer->tmp_packed_right;
-    // In case the ciphertext is signed, the sign block and the one before it
-    // are handled separately
-    if (mem_ptr->is_signed) {
-      packed_num_radix_blocks -= 2;
-    }
    pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
-                packed_num_radix_blocks, message_modulus);
+                num_radix_blocks, message_modulus);
    pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
-                packed_num_radix_blocks, message_modulus);
+                num_radix_blocks, message_modulus);
    // From this point we have half number of blocks
-    packed_num_radix_blocks /= 2;
+    num_radix_blocks /= 2;

    // Clean noise
-    auto identity_lut = mem_ptr->identity_lut;
+    auto cleaning_lut = mem_ptr->cleaning_lut;
    integer_radix_apply_univariate_lookup_table_kb(
-        stream, packed_left, packed_left, bsk, ksk, packed_num_radix_blocks,
-        identity_lut);
+        stream, packed_left, packed_left, bsk, ksk, num_radix_blocks,
+        cleaning_lut);
    integer_radix_apply_univariate_lookup_table_kb(
-        stream, packed_right, packed_right, bsk, ksk, packed_num_radix_blocks,
-        identity_lut);
+        stream, packed_right, packed_right, bsk, ksk, num_radix_blocks,
+        cleaning_lut);

    lhs = packed_left;
    rhs = packed_right;
@@ -488,78 +420,31 @@ __host__ void host_integer_radix_difference_check_kb(
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto comparisons = mem_ptr->tmp_block_comparisons;
-  auto num_comparisons = 0;
-  if (!mem_ptr->is_signed) {
-    // Compare packed blocks, or simply the total number of radix blocks in the
-    // inputs
-    compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
-                            packed_num_radix_blocks);
-    num_comparisons = packed_num_radix_blocks;
-  } else {
-    // Packing is possible
-    if (carry_modulus >= message_modulus) {
-      // Compare (num_radix_blocks - 2) / 2 packed blocks
-      compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
-                              packed_num_radix_blocks);
-
-      // Compare the last block before the sign block separately
-      auto identity_lut = mem_ptr->identity_lut;
-      Torus *last_left_block_before_sign_block =
-          diff_buffer->tmp_packed_left + packed_num_radix_blocks * big_lwe_size;
-      Torus *last_right_block_before_sign_block =
-          diff_buffer->tmp_packed_right +
-          packed_num_radix_blocks * big_lwe_size;
-      integer_radix_apply_univariate_lookup_table_kb(
-          stream, last_left_block_before_sign_block,
-          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
-          identity_lut);
-      integer_radix_apply_univariate_lookup_table_kb(
-          stream, last_right_block_before_sign_block,
-          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsk, ksk, 1,
-          identity_lut);
-      compare_radix_blocks_kb(
-          stream, comparisons + packed_num_radix_blocks * big_lwe_size,
-          last_left_block_before_sign_block, last_right_block_before_sign_block,
-          mem_ptr, bsk, ksk, 1);
-      // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb(
-          stream, comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
-          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
-          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
-          mem_ptr->signed_lut);
-      num_comparisons = packed_num_radix_blocks + 2;
-
-    } else {
-      compare_radix_blocks_kb(stream, comparisons, lwe_array_left,
-                              lwe_array_right, mem_ptr, bsk, ksk,
-                              num_radix_blocks - 1);
-      // Compare the sign block separately
-      integer_radix_apply_bivariate_lookup_table_kb(
-          stream, comparisons + (num_radix_blocks - 1) * big_lwe_size,
-          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
-          lwe_array_right + (num_radix_blocks - 1) * big_lwe_size, bsk, ksk, 1,
-          mem_ptr->signed_lut);
-      num_comparisons = num_radix_blocks;
-    }
-  }
+  compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
+                          num_radix_blocks);

  // Reduces a vec containing radix blocks that encrypts a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
  tree_sign_reduction(stream, lwe_array_out, comparisons,
                      mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
-                      ksk, num_comparisons);
+                      ksk, num_radix_blocks);
+
+  // The result will be in the first block. Everything else is garbage.
+  size_t big_lwe_size = big_lwe_dimension + 1;
+  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
+                    (total_num_radix_blocks - 1) * big_lwe_size_bytes, stream);
 }

 template <typename Torus>
-__host__ void scratch_cuda_integer_radix_comparison_check_kb(
+__host__ void scratch_cuda_integer_radix_difference_check_kb(
    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
-    bool is_signed, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
  *mem_ptr = new int_comparison_buffer<Torus>(
-      stream, op, params, num_radix_blocks, is_signed, allocate_gpu_memory);
+      stream, op, params, num_radix_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
@@ -569,11 +454,10 @@ host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                             int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                             Torus *ksk, uint32_t total_num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  // Compute the sign
  host_integer_radix_difference_check_kb(
      stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
-      mem_ptr, mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks);
+      mem_ptr, mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks);

  // Selector
  host_integer_radix_cmux_kb(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -59,9 +59,7 @@ void cuda_full_propagation_64_inplace(
        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
    break;
  default:
-    PANIC("Cuda error (full propagation inplace): unsupported polynomial size. "
-          "Supported N's are powers of two"
-          " in the interval [256..16384].")
+    break;
  }
 }

@@ -88,26 +86,13 @@ void cleanup_cuda_full_propagation(cuda_stream_t *stream,
  cuda_drop_async(mem_ptr->lut_buffer, stream);
  cuda_drop_async(mem_ptr->lut_indexes, stream);

-  cuda_drop_async(mem_ptr->lwe_indexes, stream);
+  cuda_drop_async(mem_ptr->pbs_buffer, stream);

  cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
  cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
-
-  switch (mem_ptr->pbs_type) {
-  case CLASSICAL: {
-    auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(mem_ptr->pbs_buffer);
-    x->release(stream);
-  } break;
-  case MULTI_BIT: {
-    auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(mem_ptr->pbs_buffer);
-    x->release(stream);
-  } break;
-  default:
-    PANIC("Cuda error (PBS): unsupported implementation variant.")
-  }
 }

-void scratch_cuda_propagate_single_carry_kb_64_inplace(
+void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
@@ -120,23 +105,22 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

-  scratch_cuda_propagate_single_carry_kb_inplace(
+  scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
      stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
      allocate_gpu_memory);
 }

-void cuda_propagate_single_carry_kb_64_inplace(cuda_stream_t *stream,
-                                               void *lwe_array, int8_t *mem_ptr,
-                                               void *bsk, void *ksk,
-                                               uint32_t num_blocks) {
-  host_propagate_single_carry<uint64_t>(
+void cuda_propagate_single_carry_low_latency_kb_64_inplace(
+    cuda_stream_t *stream, void *lwe_array, int8_t *mem_ptr, void *bsk,
+    void *ksk, uint32_t num_blocks) {
+  host_propagate_single_carry_low_latency<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array),
      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
      static_cast<uint64_t *>(ksk), num_blocks);
 }

-void cleanup_cuda_propagate_single_carry(cuda_stream_t *stream,
-                                         int8_t **mem_ptr_void) {
+void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
+                                                     int8_t **mem_ptr_void) {
  int_sc_prop_memory<uint64_t> *mem_ptr =
      (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release(stream);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -7,12 +7,83 @@
 #include "integer/scalar_addition.cuh"
 #include "linear_algebra.h"
 #include "linearalgebra/addition.cuh"
+#include "pbs/bootstrap_low_latency.cuh"
+#include "pbs/bootstrap_multibit.cuh"
 #include "polynomial/functions.cuh"
-#include "programmable_bootstrap.h"
-#include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <functional>

+template <typename Torus>
+void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
+                 Torus *lwe_output_indexes, Torus *lut_vector,
+                 Torus *lut_vector_indexes, Torus *lwe_array_in,
+                 Torus *lwe_input_indexes, void *bootstrapping_key,
+                 int8_t *pbs_buffer, uint32_t glwe_dimension,
+                 uint32_t lwe_dimension, uint32_t polynomial_size,
+                 uint32_t base_log, uint32_t level_count,
+                 uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
+                 uint32_t num_luts, uint32_t lwe_idx,
+                 uint32_t max_shared_memory, PBS_TYPE pbs_type) {
+  if (sizeof(Torus) == sizeof(uint32_t)) {
+    // 32 bits
+    switch (pbs_type) {
+    case MULTI_BIT:
+      printf("multibit\n");
+      printf("Error: 32-bit multibit PBS is not supported.\n");
+      break;
+    case LOW_LAT:
+      cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
+          num_luts, lwe_idx, max_shared_memory);
+      break;
+    case AMORTIZED:
+      cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
+          num_luts, lwe_idx, max_shared_memory);
+      break;
+    default:
+      break;
+    }
+  } else {
+    // 64 bits
+    switch (pbs_type) {
+    case MULTI_BIT:
+      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, grouping_factor, base_log, level_count,
+          input_lwe_ciphertext_count, num_luts, lwe_idx,
+          max_shared_memory);
+      break;
+    case LOW_LAT:
+      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
+          num_luts, lwe_idx, max_shared_memory);
+      break;
+    case AMORTIZED:
+      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
+          stream, lwe_array_out, lwe_output_indexes, lut_vector,
+          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
+          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
+          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
+          num_luts, lwe_idx, max_shared_memory);
+      break;
+    default:
+      break;
+    }
+  }
+}
+
 // function rotates right  radix ciphertext with specific value
 // grid is one dimensional
 // blockIdx.x represents x_th block of radix ciphertext
@@ -62,40 +133,35 @@ __global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
 // polynomial_size threads
 template <typename Torus>
 __global__ void
-device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_indexes_out,
-                             Torus *lwe_array_1, Torus *lwe_array_2,
-                             Torus *lwe_indexes_in, uint32_t lwe_dimension,
-                             uint32_t shift, uint32_t num_blocks) {
+device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_array_1,
+                             Torus *lwe_array_2, Torus *lwe_indexes,
+                             uint32_t lwe_dimension, uint32_t message_modulus,
+                             uint32_t num_blocks) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if (tid < num_blocks * (lwe_dimension + 1)) {
    int block_id = tid / (lwe_dimension + 1);
    int coeff_id = tid % (lwe_dimension + 1);

-    int pos_in = lwe_indexes_in[block_id] * (lwe_dimension + 1) + coeff_id;
-    int pos_out = lwe_indexes_out[block_id] * (lwe_dimension + 1) + coeff_id;
-    lwe_array_out[pos_out] = lwe_array_1[pos_in] * shift + lwe_array_2[pos_in];
+    int pos = lwe_indexes[block_id] * (lwe_dimension + 1) + coeff_id;
+    lwe_array_out[pos] = lwe_array_1[pos] * message_modulus + lwe_array_2[pos];
  }
 }

-/* Combine lwe_array_1 and lwe_array_2 so that each block m1 and m2
- *  becomes out = m1 * shift + m2
- */
 template <typename Torus>
 __host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
-                                    Torus *lwe_indexes_out, Torus *lwe_array_1,
-                                    Torus *lwe_array_2, Torus *lwe_indexes_in,
-                                    uint32_t lwe_dimension, uint32_t shift,
+                                    Torus *lwe_array_1, Torus *lwe_array_2,
+                                    Torus *lwe_indexes, uint32_t lwe_dimension,
+                                    uint32_t message_modulus,
                                    uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  // Left message is shifted
  int num_blocks = 0, num_threads = 0;
  int num_entries = num_radix_blocks * (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
-      lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
-      lwe_dimension, shift, num_radix_blocks);
+      lwe_array_out, lwe_array_1, lwe_array_2, lwe_indexes, lwe_dimension,
+      message_modulus, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
 }

@@ -103,7 +169,6 @@ template <typename Torus>
 __host__ void integer_radix_apply_univariate_lookup_table_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
-  cudaSetDevice(stream->gpu_index);
  // apply_lookup_table
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
@@ -119,16 +184,16 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(

  // Compute Keyswitch-PBS
  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_in,
-      lut->lwe_indexes_in, ksk, big_lwe_dimension, small_lwe_dimension,
+      stream, lut->tmp_lwe_after_ks, lut->lwe_indexes, lwe_array_in,
+      lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
      ks_base_log, ks_level, num_radix_blocks);

-  execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
-                     lut->lut_indexes, lut->tmp_lwe_after_ks,
-                     lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
-                     small_lwe_dimension, polynomial_size, pbs_base_log,
-                     pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-                     cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
+  execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
+              lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk,
+              lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
+              polynomial_size, pbs_base_log, pbs_level, grouping_factor,
+              num_radix_blocks, 1, 0,
+              cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
 }

 template <typename Torus>
@@ -136,40 +201,22 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
    Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
    int_radix_lut<Torus> *lut) {
-  cudaSetDevice(stream->gpu_index);
  // apply_lookup_table_bivariate
+
  auto params = lut->params;
-  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto small_lwe_dimension = params.small_lwe_dimension;
-  auto ks_level = params.ks_level;
-  auto ks_base_log = params.ks_base_log;
-  auto pbs_level = params.pbs_level;
-  auto pbs_base_log = params.pbs_base_log;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto grouping_factor = params.grouping_factor;
  auto message_modulus = params.message_modulus;

  // Left message is shifted
-  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
-  pack_bivariate_blocks(stream, lwe_array_pbs_in, lut->lwe_trivial_indexes,
-                        lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
-                        big_lwe_dimension, message_modulus, num_radix_blocks);
+  pack_bivariate_blocks(stream, lut->tmp_lwe_before_ks, lwe_array_1,
+                        lwe_array_2, lut->lwe_indexes, big_lwe_dimension,
+                        message_modulus, num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  // Apply LUT
-  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_pbs_in,
-      lut->lwe_trivial_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
-      ks_base_log, ks_level, num_radix_blocks);
-
-  execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
-                     lut->lut_indexes, lut->tmp_lwe_after_ks,
-                     lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
-                     small_lwe_dimension, polynomial_size, pbs_base_log,
-                     pbs_level, grouping_factor, num_radix_blocks, 1, 0,
-                     cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
+  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
+                                                 lut->tmp_lwe_before_ks, bsk,
+                                                 ksk, num_radix_blocks, lut);
 }

 // Rotates the slice in-place such that the first mid elements of the slice move
@@ -261,8 +308,8 @@ void generate_device_accumulator_bivariate(
      acc_bivariate, h_lut,
      (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);

-  // Release memory when possible
-  cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
+  cuda_synchronize_stream(stream);
+  free(h_lut);
 }

 /*
@@ -293,12 +340,12 @@ void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
      stream);

-  // Release memory when possible
-  cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
+  cuda_synchronize_stream(stream);
+  free(h_lut);
 }

 template <typename Torus>
-void scratch_cuda_propagate_single_carry_kb_inplace(
+void scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
    cuda_stream_t *stream, int_sc_prop_memory<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    bool allocate_gpu_memory) {
@@ -308,12 +355,15 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
 }

 template <typename Torus>
-void host_propagate_single_carry(cuda_stream_t *stream, Torus *lwe_array,
-                                 int_sc_prop_memory<Torus> *mem, void *bsk,
-                                 Torus *ksk, uint32_t num_blocks) {
+void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
+                                             Torus *lwe_array,
+                                             int_sc_prop_memory<Torus> *mem,
+                                             void *bsk, Torus *ksk,
+                                             uint32_t num_blocks) {
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
+  auto message_modulus = params.message_modulus;
  auto big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

@@ -361,71 +411,13 @@ void host_propagate_single_carry(cuda_stream_t *stream, Torus *lwe_array,
      stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
 }

-template <typename Torus>
-void host_propagate_single_sub_borrow(cuda_stream_t *stream, Torus *overflowed,
-                                      Torus *lwe_array,
-                                      int_single_borrow_prop_memory<Torus> *mem,
-                                      void *bsk, Torus *ksk,
-                                      uint32_t num_blocks) {
-  auto params = mem->params;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto big_lwe_size = glwe_dimension * polynomial_size + 1;
-  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  auto generates_or_propagates = mem->generates_or_propagates;
-  auto step_output = mem->step_output;
-
-  auto luts_array = mem->luts_array;
-  auto luts_carry_propagation_sum = mem->luts_borrow_propagation_sum;
-  auto message_acc = mem->message_acc;
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
-      luts_array);
-
-  // compute prefix sum with hillis&steele
-  int num_steps = ceil(log2((double)num_blocks));
-  int space = 1;
-  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
-                               big_lwe_size_bytes * num_blocks, stream);
-
-  for (int step = 0; step < num_steps; step++) {
-    auto cur_blocks = &step_output[space * big_lwe_size];
-    auto prev_blocks = generates_or_propagates;
-    int cur_total_blocks = num_blocks - space;
-
-    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
-        luts_carry_propagation_sum);
-
-    cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
-                                 cur_blocks,
-                                 big_lwe_size_bytes * cur_total_blocks, stream);
-    space *= 2;
-  }
-
-  cuda_memcpy_async_gpu_to_gpu(
-      overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
-      big_lwe_size_bytes, stream);
-
-  radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
-      step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
-  cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
-
-  host_subtraction(stream, lwe_array, lwe_array, step_output,
-                   glwe_dimension * polynomial_size, num_blocks);
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
-}
-
 /*
 * input_blocks: input radix ciphertext propagation will happen inplace
 * acc_message_carry: list of two lut s, [(message_acc), (carry_acc)]
- * lut_indexes_message_carry: lut_indexes for message and carry, should always
- * be  {0, 1} small_lwe_vector: output of keyswitch should have size = 2 *
- * (lwe_dimension + 1) * sizeof(Torus) big_lwe_vector: output of pbs should have
+ * lut_indexes_message_carry: lut_indexes for message and carry, should always be {0, 1}
+ * small_lwe_vector: output of keyswitch should have
+ *     size = 2 * (lwe_dimension + 1) * sizeof(Torus)
+ * big_lwe_vector: output of pbs should have
 *     size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
 */
 template <typename Torus, typename STorus, class params>
@@ -482,12 +474,31 @@ void scratch_cuda_full_propagation(
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {

+  // PBS
  int8_t *pbs_buffer;
-  execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
-                             polynomial_size, pbs_level, grouping_factor,
-                             num_radix_blocks,
-                             cuda_get_max_shared_memory(stream->gpu_index),
-                             pbs_type, allocate_gpu_memory);
+  if (pbs_type == MULTI_BIT) {
+    uint32_t lwe_chunk_size = get_average_lwe_chunk_size(
+        lwe_dimension, pbs_level, glwe_dimension, num_radix_blocks);
+    // Only 64 bits is supported
+    scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
+                                  glwe_dimension, polynomial_size, pbs_level,
+                                  grouping_factor, num_radix_blocks,
+                                  cuda_get_max_shared_memory(stream->gpu_index),
+                                  allocate_gpu_memory, lwe_chunk_size);
+  } else {
+    // Classic
+    // We only use low latency for classic mode
+    if (sizeof(Torus) == sizeof(uint32_t))
+      scratch_cuda_bootstrap_low_latency_32(
+          stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
+          num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
+          allocate_gpu_memory);
+    else
+      scratch_cuda_bootstrap_low_latency_64(
+          stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
+          num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
+          allocate_gpu_memory);
+  }

  // LUT
  Torus *lut_buffer;
@@ -540,8 +551,8 @@ void scratch_cuda_full_propagation(
      h_lwe_indexes[i] = i;
    cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
                             stream);
-    cuda_stream_add_callback(stream, host_free_on_stream_callback,
-                             h_lwe_indexes);
+    cuda_synchronize_stream(stream);
+    free(h_lwe_indexes);
  }

  // Temporary arrays
@@ -587,7 +598,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
      packed_block[tid] = lsb_block[tid] + factor * msb_block[tid];
    }

-    if (num_radix_blocks % 2 == 1) {
+    if (num_radix_blocks % 2 != 0) {
      // We couldn't pack the last block, so we just copy it
      Torus *lsb_block =
          lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
@@ -610,11 +621,7 @@ template <typename Torus>
 __host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
                          Torus *lwe_array_in, uint32_t lwe_dimension,
                          uint32_t num_radix_blocks, uint32_t factor) {
-  if (lwe_array_out == lwe_array_in)
-    PANIC("Cuda error in pack blocks: input and output pointers must be "
-          "different.");
-
-  cudaSetDevice(stream->gpu_index);
+  assert(lwe_array_out != lwe_array_in);

  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
@@ -644,7 +651,6 @@ create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
                     uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
                     uint64_t message_modulus, uint64_t carry_modulus) {

-  cudaSetDevice(stream->gpu_index);
  size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
  cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);

@@ -668,107 +674,4 @@ create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
  check_cuda_error(cudaGetLastError());
 }

-/**
- * Each bit in lwe_array_in becomes a lwe ciphertext in lwe_array_out
- * Thus, lwe_array_out must be allocated with num_radix_blocks * bits_per_block
- * * (lwe_dimension+1) * sizeeof(Torus) bytes
- */
-template <typename Torus>
-__host__ void extract_n_bits(cuda_stream_t *stream, Torus *lwe_array_out,
-                             Torus *lwe_array_in, void *bsk, Torus *ksk,
-                             uint32_t num_radix_blocks, uint32_t bits_per_block,
-                             int_bit_extract_luts_buffer<Torus> *bit_extract) {
-
-  integer_radix_apply_univariate_lookup_table_kb(
-      stream, lwe_array_out, lwe_array_in, bsk, ksk,
-      num_radix_blocks * bits_per_block, bit_extract->lut);
-}
-
-template <typename Torus>
-__host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
-                           Torus *signs_array_in,
-                           int_comparison_buffer<Torus> *mem_ptr,
-                           std::function<Torus(Torus)> sign_handler_f,
-                           void *bsk, Torus *ksk, uint32_t num_sign_blocks) {
-
-  auto diff_buffer = mem_ptr->diff_buffer;
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  std::function<Torus(Torus)> reduce_two_orderings_function =
-      [diff_buffer, sign_handler_f](Torus x) -> Torus {
-    int msb = (x >> 2) & 3;
-    int lsb = x & 3;
-
-    return diff_buffer->tree_buffer->block_selector_f(msb, lsb);
-  };
-
-  auto signs_a = diff_buffer->tmp_signs_a;
-  auto signs_b = diff_buffer->tmp_signs_b;
-
-  cuda_memcpy_async_gpu_to_gpu(
-      signs_a, signs_array_in,
-      (big_lwe_dimension + 1) * num_sign_blocks * sizeof(Torus), stream);
-  if (num_sign_blocks > 2) {
-    auto lut = diff_buffer->reduce_signs_lut;
-    generate_device_accumulator<Torus>(
-        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
-        carry_modulus, reduce_two_orderings_function);
-
-    while (num_sign_blocks > 2) {
-      pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, num_sign_blocks,
-                  4);
-      integer_radix_apply_univariate_lookup_table_kb(
-          stream, signs_a, signs_b, bsk, ksk, num_sign_blocks / 2, lut);
-
-      auto last_block_signs_b =
-          signs_b + (num_sign_blocks / 2) * (big_lwe_dimension + 1);
-      auto last_block_signs_a =
-          signs_a + (num_sign_blocks / 2) * (big_lwe_dimension + 1);
-      if (num_sign_blocks % 2 == 1)
-        cuda_memcpy_async_gpu_to_gpu(last_block_signs_a, last_block_signs_b,
-                                     (big_lwe_dimension + 1) * sizeof(Torus),
-                                     stream);
-
-      num_sign_blocks = (num_sign_blocks / 2) + (num_sign_blocks % 2);
-    }
-  }
-
-  if (num_sign_blocks == 2) {
-    std::function<Torus(Torus)> final_lut_f =
-        [reduce_two_orderings_function, sign_handler_f](Torus x) -> Torus {
-      Torus final_sign = reduce_two_orderings_function(x);
-      return sign_handler_f(final_sign);
-    };
-
-    auto lut = diff_buffer->reduce_signs_lut;
-    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
-                                       polynomial_size, message_modulus,
-                                       carry_modulus, final_lut_f);
-
-    pack_blocks(stream, signs_b, signs_a, big_lwe_dimension, 2, 4);
-    integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
-                                                   signs_b, bsk, ksk, 1, lut);
-
-  } else {
-
-    std::function<Torus(Torus)> final_lut_f =
-        [mem_ptr, sign_handler_f](Torus x) -> Torus {
-      return sign_handler_f(x & 3);
-    };
-
-    auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
-    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
-                                       polynomial_size, message_modulus,
-                                       carry_modulus, final_lut_f);
-
-    integer_radix_apply_univariate_lookup_table_kb(stream, signs_array_out,
-                                                   signs_a, bsk, ksk, 1, lut);
-  }
-}
 #endif // TFHE_RS_INTERNAL_INTEGER_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -1,66 +1,5 @@
 #include "integer/multiplication.cuh"

-/*
- * when adding chunk_size times terms together, there might be some blocks
- * where addition have not happened or degree is zero, in that case we don't
- * need to apply lookup table, so we find the indexes of the blocks where
- * addition happened and store them inside h_lwe_idx_in, from same block
- * might be extracted message and carry(if it is not the last block), so
- * one block id might have two output id and we store them in h_lwe_idx_out
- * blocks that do not require applying lookup table might be copied on both
- * message and carry side or be replaced with zero ciphertexts, indexes of such
- * blocks are stored inside h_smart_copy_in as input ids and h_smart_copy_out
- * as output ids, -1 value as an input id means that zero ciphertext will be
- * copied on output index.
- */
-void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
-                                 size_t *h_lwe_idx_out,
-                                 int32_t *h_smart_copy_in,
-                                 int32_t *h_smart_copy_out, size_t ch_amount,
-                                 uint32_t num_radix, uint32_t num_blocks,
-                                 size_t chunk_size, size_t message_max,
-                                 size_t &total_count, size_t &message_count,
-                                 size_t &carry_count, size_t &sm_copy_count) {
-  for (size_t c_id = 0; c_id < ch_amount; c_id++) {
-    auto cur_chunk = &terms_degree[c_id * chunk_size * num_blocks];
-    for (size_t r_id = 0; r_id < num_blocks; r_id++) {
-      size_t new_degree = 0;
-      for (size_t chunk_id = 0; chunk_id < chunk_size; chunk_id++) {
-        new_degree += cur_chunk[chunk_id * num_blocks + r_id];
-      }
-
-      if (new_degree > message_max) {
-        h_lwe_idx_in[message_count] = c_id * num_blocks + r_id;
-        h_lwe_idx_out[message_count] = c_id * num_blocks + r_id;
-        message_count++;
-      } else {
-        h_smart_copy_in[sm_copy_count] = c_id * num_blocks + r_id;
-        h_smart_copy_out[sm_copy_count] = c_id * num_blocks + r_id;
-        sm_copy_count++;
-      }
-    }
-  }
-  for (size_t i = 0; i < sm_copy_count; i++) {
-    h_smart_copy_in[i] = -1;
-    h_smart_copy_out[i] = h_smart_copy_out[i] + ch_amount * num_blocks + 1;
-  }
-
-  for (size_t i = 0; i < message_count; i++) {
-    if (h_lwe_idx_in[i] % num_blocks != num_blocks - 1) {
-      h_lwe_idx_in[message_count + carry_count] = h_lwe_idx_in[i];
-      h_lwe_idx_out[message_count + carry_count] =
-          ch_amount * num_blocks + h_lwe_idx_in[i] + 1;
-      carry_count++;
-    } else {
-      h_smart_copy_in[sm_copy_count] = -1;
-      h_smart_copy_out[sm_copy_count] =
-          h_lwe_idx_in[i] - (num_blocks - 1) + ch_amount * num_blocks;
-      sm_copy_count++;
-    }
-  }
-
-  total_count = message_count + carry_count;
-}
 /*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the integer radix multiplication in keyswitch->bootstrap order.
@@ -74,9 +13,9 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          polynomial_size * glwe_dimension, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus);
+                          polynomial_size, lwe_dimension, ks_level, ks_base_log,
+                          pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);

  switch (polynomial_size) {
  case 2048:
@@ -85,8 +24,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
        allocate_gpu_memory);
    break;
  default:
-    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
-          "Only N = 2048 is supported")
+    break;
  }
 }

@@ -137,8 +75,7 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
        num_blocks);
    break;
  default:
-    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
-          "Only N = 2048 is supported")
+    break;
  }
 }

@@ -150,92 +87,21 @@ void cleanup_cuda_integer_mult(cuda_stream_t *stream, int8_t **mem_ptr_void) {
  mem_ptr->release(stream);
 }

-void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks_in_radix,
-    uint32_t max_num_radix_in_vec, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
+void cuda_small_scalar_multiplication_integer_radix_ciphertext_64_inplace(
+    cuda_stream_t *stream, void *lwe_array, uint64_t scalar,
+    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus);
-  scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
-      stream, (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr,
-      num_blocks_in_radix, max_num_radix_in_vec, params, allocate_gpu_memory);
+  cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
+      stream, lwe_array, lwe_array, scalar, lwe_dimension,
+      lwe_ciphertext_count);
 }

-void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
-    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_vec,
-    uint32_t num_radix_in_vec, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t num_blocks_in_radix) {
+void cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
+    cuda_stream_t *stream, void *output_lwe_array, void *input_lwe_array,
+    uint64_t scalar, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

-  auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
-
-  int *terms_degree =
-      (int *)malloc(num_blocks_in_radix * num_radix_in_vec * sizeof(int));
-
-  for (int i = 0; i < num_radix_in_vec * num_blocks_in_radix; i++) {
-    terms_degree[i] = mem->params.message_modulus - 1;
-  }
-
-  switch (mem->params.polynomial_size) {
-  case 512:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
-    break;
-  case 1024:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
-    break;
-  case 2048:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
-    break;
-  case 4096:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
-    break;
-  case 8192:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
-    break;
-  case 16384:
-    host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks_in_radix,
-        num_radix_in_vec);
-    break;
-  default:
-    PANIC("Cuda error (integer sum ciphertexts): unsupported polynomial size. "
-          "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
-  }
-
-  free(terms_degree);
-}
-
-void cleanup_cuda_integer_radix_sum_ciphertexts_vec(cuda_stream_t *stream,
-                                                    int8_t **mem_ptr_void) {
-  int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
-      (int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release(stream);
+  host_integer_small_scalar_mult_radix(
+      stream, static_cast<uint64_t *>(output_lwe_array),
+      static_cast<uint64_t *>(input_lwe_array), scalar, lwe_dimension,
+      lwe_ciphertext_count);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -6,12 +6,16 @@
 #include <cuda_runtime.h>
 #endif

+#include "bootstrap.h"
+#include "bootstrap_multibit.h"
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.h"
 #include "integer/integer.cuh"
 #include "linear_algebra.h"
-#include "programmable_bootstrap.h"
+#include "pbs/bootstrap_amortized.cuh"
+#include "pbs/bootstrap_low_latency.cuh"
+#include "pbs/bootstrap_multibit.cuh"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <fstream>
@@ -21,24 +25,6 @@
 #include <string>
 #include <vector>

-template <typename Torus>
-__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
-                           int32_t *id_in, size_t lwe_size) {
-  size_t tid = threadIdx.x;
-  size_t b_id = blockIdx.x;
-  size_t stride = blockDim.x;
-
-  auto input_id = id_in[b_id];
-  auto output_id = id_out[b_id];
-
-  auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
-  auto cur_dst = &dst[output_id * lwe_size];
-
-  for (int i = tid; i < lwe_size; i += stride) {
-    cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
-  }
-}
-
 template <typename Torus, class params>
 __global__ void
 all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
@@ -92,37 +78,98 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
 }

 template <typename Torus>
+void compress_device_array_with_map(cuda_stream_t *stream, Torus *src,
+                                    Torus *dst, int *S, int *F, int num_blocks,
+                                    uint32_t map_size, uint32_t unit_size,
+                                    int &total_copied, bool is_message) {
+  for (int i = 0; i < map_size; i++) { // one async device-to-device copy per map entry
+    int s_index = i * num_blocks + S[i]; // source unit: row i (num_blocks units wide), offset S[i]
+    int number_of_unit = F[i] - S[i] + is_message; // units S[i]..F[i]; F[i] itself is included only when is_message
+    auto cur_dst = &dst[total_copied * unit_size]; // append right after the units already copied
+    auto cur_src = &src[s_index * unit_size];
+    size_t copy_size = unit_size * number_of_unit * sizeof(Torus); // NOTE(review): wraps to a huge size if F[i] < S[i] - confirm callers prevent it
+    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
+    total_copied += number_of_unit; // running total, written back to the caller through the reference
+  }
+}
+
+template <typename Torus>
+void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src,
+                                         Torus *dst, int *S, int *F,
+                                         uint32_t map_size, uint32_t unit_size,
+                                         int &total_copied,
+                                         int &total_radix_copied,
+                                         int num_blocks, bool is_message) {
+  size_t radix_size = unit_size * num_blocks; // element count of one num_blocks-wide radix in dst
+  for (int i = 0; i < map_size; i++) { // each map entry fills one radix of dst
+    auto cur_dst_radix = &dst[total_radix_copied * radix_size];
+
+    int s_index = S[i];
+    int number_of_unit = F[i] - s_index + is_message; // units S[i]..F[i]; F[i] itself is included only when is_message
+
+    if (!is_message) { // non-message pass: zero the leading units and place the copied units after them
+      int zero_block_count = num_blocks - number_of_unit;
+      cuda_memset_async(cur_dst_radix, 0,
+                        zero_block_count * unit_size * sizeof(Torus), stream);
+      s_index = zero_block_count; // copied units start right after the zeroed prefix
+    }
+
+    auto cur_dst = &cur_dst_radix[s_index * unit_size];
+    auto cur_src = &src[total_copied * unit_size]; // src is consumed contiguously, in map order
+
+    size_t copy_size = unit_size * number_of_unit * sizeof(Torus);
+    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
+    total_copied += number_of_unit;
+    ++total_radix_copied; // the next entry writes the following radix of dst
+  }
+}
+
+template <typename Torus, class params>
 __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
-                                uint32_t chunk_size, uint32_t block_size,
-                                uint32_t num_blocks) {
+                                uint32_t chunk_size, uint32_t num_blocks) {

  extern __shared__ Torus result[];
-  size_t stride = blockDim.x;
  size_t chunk_id = blockIdx.x;
-  size_t chunk_elem_size = chunk_size * num_blocks * block_size;
-  size_t radix_elem_size = num_blocks * block_size;
+  size_t chunk_elem_size = chunk_size * num_blocks * (params::degree + 1);
+  size_t radix_elem_size = num_blocks * (params::degree + 1);
  auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
  auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
-  size_t block_stride = blockIdx.y * block_size;
+  size_t block_stride = blockIdx.y * (params::degree + 1);
  auto dst_block = &dst_radix[block_stride];

  // init shared mem with first radix of chunk
  size_t tid = threadIdx.x;
-  for (int i = tid; i < block_size; i += stride) {
-    result[i] = src_chunk[block_stride + i];
+  for (int i = 0; i < params::opt; i++) {
+    result[tid] = src_chunk[block_stride + tid];
+    tid += params::degree / params::opt;
+  }
+
+  if (threadIdx.x == 0) {
+    result[params::degree] = src_chunk[block_stride + params::degree];
  }

  // accumulate rest  of the radixes
  for (int r_id = 1; r_id < chunk_size; r_id++) {
    auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
-    for (int i = tid; i < block_size; i += stride) {
-      result[i] += cur_src_radix[block_stride + i];
+    tid = threadIdx.x;
+    for (int i = 0; i < params::opt; i++) {
+      result[tid] += cur_src_radix[block_stride + tid];
+      tid += params::degree / params::opt;
+    }
+    if (threadIdx.x == 0) {
+      result[params::degree] += cur_src_radix[block_stride + params::degree];
    }
  }

  // put result from shared mem to global mem
-  for (int i = tid; i < block_size; i += stride) {
-    dst_block[i] = result[i];
+  tid = threadIdx.x;
+  for (int i = 0; i < params::opt; i++) {
+    dst_block[tid] = result[tid];
+    tid += params::degree / params::opt;
+  }
+
+  if (threadIdx.x == 0) {
+    dst_block[params::degree] = result[params::degree];
  }
 }

@@ -173,142 +220,6 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
        (process_msb) ? cur_msb_ct[params::degree] : 0;
  }
 }
-template <typename Torus>
-__host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
-    cuda_stream_t *stream, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
-    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
-    int_radix_params params, bool allocate_gpu_memory) {
-
-  cudaSetDevice(stream->gpu_index);
-  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  check_cuda_error(cudaFuncSetAttribute(
-      tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
-      sm_size));
-  cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
-  check_cuda_error(cudaGetLastError());
-  *mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
-      stream, params, num_blocks_in_radix, max_num_radix_in_vec,
-      allocate_gpu_memory);
-}
-
-template <typename Torus, class params>
-__host__ void host_integer_sum_ciphertexts_vec_kb(
-    cuda_stream_t *stream, Torus *radix_lwe_out, Torus *terms,
-    int *terms_degree, void *bsk, uint64_t *ksk,
-    int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
-    uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec) {
-
-  cudaSetDevice(stream->gpu_index);
-  auto new_blocks = mem_ptr->new_blocks;
-  auto old_blocks = mem_ptr->old_blocks;
-  auto small_lwe_vector = mem_ptr->small_lwe_vector;
-
-  auto luts_message_carry = mem_ptr->luts_message_carry;
-
-  auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
-  auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
-
-  auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
-  auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
-
-  auto message_modulus = mem_ptr->params.message_modulus;
-  auto carry_modulus = mem_ptr->params.carry_modulus;
-  auto num_blocks = num_blocks_in_radix;
-  auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1;
-  auto glwe_dimension = mem_ptr->params.glwe_dimension;
-  auto polynomial_size = mem_ptr->params.polynomial_size;
-  auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
-  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
-
-  if (old_blocks != terms) {
-    cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
-                                 num_blocks_in_radix * num_radix_in_vec *
-                                     big_lwe_size * sizeof(Torus),
-                                 stream);
-  }
-
-  size_t r = num_radix_in_vec;
-  size_t total_modulus = message_modulus * carry_modulus;
-  size_t message_max = message_modulus - 1;
-  size_t chunk_size = (total_modulus - 1) / message_max;
-
-  size_t h_lwe_idx_in[r * num_blocks];
-  size_t h_lwe_idx_out[r * num_blocks];
-  int32_t h_smart_copy_in[r * num_blocks];
-  int32_t h_smart_copy_out[r * num_blocks];
-
-  auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
-
-  while (r > 2) {
-    size_t cur_total_blocks = r * num_blocks;
-    size_t ch_amount = r / chunk_size;
-    if (!ch_amount)
-      ch_amount++;
-    dim3 add_grid(ch_amount, num_blocks, 1);
-    size_t sm_size = big_lwe_size * sizeof(Torus);
-
-    tree_add_chunks<Torus><<<add_grid, 512, sm_size, stream->stream>>>(
-        new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
-
-    size_t total_count = 0;
-    size_t message_count = 0;
-    size_t carry_count = 0;
-    size_t sm_copy_count = 0;
-
-    generate_ids_update_degrees(
-        terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
-        h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
-        total_count, message_count, carry_count, sm_copy_count);
-
-    size_t copy_size = total_count * sizeof(Torus);
-    cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size, stream);
-    cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size, stream);
-    copy_size = sm_copy_count * sizeof(int32_t);
-    cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
-                             stream);
-    cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
-                             stream);
-
-    smart_copy<<<sm_copy_count, 256, 0, stream->stream>>>(
-        new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
-        big_lwe_size);
-
-    if (carry_count > 0)
-      cuda_set_value_async<Torus>(
-          &(stream->stream), luts_message_carry->get_lut_indexes(message_count),
-          1, carry_count);
-
-    cuda_keyswitch_lwe_ciphertext_vector(
-        stream, small_lwe_vector, lwe_indexes_in, new_blocks, lwe_indexes_in,
-        ksk, polynomial_size * glwe_dimension, lwe_dimension,
-        mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
-
-    execute_pbs<Torus>(
-        stream, new_blocks, lwe_indexes_out, luts_message_carry->lut,
-        luts_message_carry->lut_indexes, small_lwe_vector, lwe_indexes_in, bsk,
-        luts_message_carry->buffer, glwe_dimension, lwe_dimension,
-        polynomial_size, mem_ptr->params.pbs_base_log,
-        mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count,
-        2, 0, max_shared_memory, mem_ptr->params.pbs_type);
-
-    int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
-    int new_blocks_created = 2 * ch_amount * num_blocks;
-    copy_size = rem_blocks * big_lwe_size * sizeof(Torus);
-
-    auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
-    auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
-    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
-    std::swap(new_blocks, old_blocks);
-    r = (new_blocks_created + rem_blocks) / num_blocks;
-  }
-
-  host_addition(stream, radix_lwe_out, old_blocks,
-                &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
-                num_blocks);
-
-  host_propagate_single_carry<Torus>(stream, radix_lwe_out, mem_ptr->scp_mem,
-                                     bsk, ksk, num_blocks);
-}

 template <typename Torus, typename STorus, class params>
 __host__ void host_integer_mult_radix_kb(
@@ -316,7 +227,6 @@ __host__ void host_integer_mult_radix_kb(
    uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
  auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
@@ -324,6 +234,7 @@ __host__ void host_integer_mult_radix_kb(
  auto carry_modulus = mem_ptr->params.carry_modulus;

  int big_lwe_dimension = glwe_dimension * polynomial_size;
+  int big_lwe_size = big_lwe_dimension + 1;

  // 'vector_result_lsb' contains blocks from all possible right shifts of
  // radix_lwe_left, only nonzero blocks are kept
@@ -366,11 +277,27 @@ __host__ void host_integer_mult_radix_kb(
  // lwe_dimension +1 coefficients
  auto small_lwe_vector = mem_ptr->small_lwe_vector;

+  // buffer to keep the pbs results for num_blocks^2 lwe ciphertexts;
+  // in total it holds num_blocks^2 big lwe ciphertexts, each with
+  // glwe_dimension * polynomial_size + 1 coefficients
+  auto lwe_pbs_out_array = mem_ptr->lwe_pbs_out_array;
+
  // it contains two lut, first for lsb extraction,
  // second for msb extraction, with total length =
  // 2 * (glwe_dimension + 1) * polynomial_size
  auto luts_array = mem_ptr->luts_array;

+  // accumulator to extract message
+  // with length (glwe_dimension + 1) * polynomial_size
+  auto luts_message = mem_ptr->luts_message;
+
+  // accumulator to extract carry
+  // with length (glwe_dimension + 1) * polynomial_size
+  auto luts_carry = mem_ptr->luts_carry;
+
+  // to be used as default indexing
+  auto lwe_indexes = luts_array->lwe_indexes;
+
  auto vector_result_lsb = &vector_result_sb[0];
  auto vector_result_msb =
      &vector_result_sb[lsb_vector_block_count *
@@ -402,22 +329,144 @@ __host__ void host_integer_mult_radix_kb(
                           lsb_vector_block_count, msb_vector_block_count,
                           num_blocks);

-  int terms_degree[2 * num_blocks * num_blocks];
+  auto new_blocks = block_mul_res;
+  auto old_blocks = vector_result_sb;
+
+  // amount of current radixes after block_mul
+  size_t r = 2 * num_blocks;
+
+  size_t total_modulus = message_modulus * carry_modulus;
+  size_t message_max = message_modulus - 1;
+  size_t chunk_size = (total_modulus - 1) / message_max;
+  size_t ch_amount = r / chunk_size;
+
+  int terms_degree[r * num_blocks];
+  int f_b[ch_amount];
+  int l_b[ch_amount];
+
  for (int i = 0; i < num_blocks * num_blocks; i++) {
    size_t r_id = i / num_blocks;
    size_t b_id = i % num_blocks;
-    terms_degree[i] = (b_id >= r_id) ? message_modulus - 1 : 0;
+    terms_degree[i] = (b_id >= r_id) ? 3 : 0;
  }
  auto terms_degree_msb = &terms_degree[num_blocks * num_blocks];
  for (int i = 0; i < num_blocks * num_blocks; i++) {
    size_t r_id = i / num_blocks;
    size_t b_id = i % num_blocks;
-    terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
+    terms_degree_msb[i] = (b_id > r_id) ? 2 : 0;
  }

-  host_integer_sum_ciphertexts_vec_kb<Torus, params>(
-      stream, radix_lwe_out, vector_result_sb, terms_degree, bsk, ksk,
-      mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
+  auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
+  while (r > chunk_size) {
+    int cur_total_blocks = r * num_blocks;
+    ch_amount = r / chunk_size;
+    dim3 add_grid(ch_amount, num_blocks, 1);
+    size_t sm_size = big_lwe_size * sizeof(Torus);
+    cuda_memset_async(new_blocks, 0,
+                      ch_amount * num_blocks * big_lwe_size * sizeof(Torus),
+                      stream);
+
+    tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
+        new_blocks, old_blocks, chunk_size, num_blocks);
+
+    for (int c_id = 0; c_id < ch_amount; c_id++) {
+      auto cur_chunk = &terms_degree[c_id * chunk_size * num_blocks];
+      int mx = 0;
+      int mn = num_blocks;
+      for (int r_id = 1; r_id < chunk_size; r_id++) {
+        auto cur_radix = &cur_chunk[r_id * num_blocks];
+        for (int i = 0; i < num_blocks; i++) {
+          if (cur_radix[i]) {
+            mn = min(mn, i);
+            mx = max(mx, i);
+          }
+        }
+      }
+      f_b[c_id] = mn;
+      l_b[c_id] = mx;
+    }
+
+    int total_copied = 0;
+    int message_count = 0;
+    int carry_count = 0;
+    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
+                                          l_b, num_blocks, ch_amount,
+                                          big_lwe_size, total_copied, true);
+
+    message_count = total_copied;
+    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
+                                          l_b, num_blocks, ch_amount,
+                                          big_lwe_size, total_copied, false);
+    carry_count = total_copied - message_count;
+
+    auto message_blocks_vector = old_blocks;
+    auto carry_blocks_vector =
+        &old_blocks[message_count * (glwe_dimension * polynomial_size + 1)];
+
+    cuda_keyswitch_lwe_ciphertext_vector(
+        stream, small_lwe_vector, lwe_indexes, old_blocks, lwe_indexes, ksk,
+        polynomial_size * glwe_dimension, lwe_dimension,
+        mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);
+
+    execute_pbs<Torus>(
+        stream, message_blocks_vector, lwe_indexes, luts_message->lut,
+        luts_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
+        luts_message->pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, mem_ptr->params.pbs_base_log,
+        mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
+        message_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type);
+
+    execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
+                       luts_carry->lut, luts_carry->lut_indexes,
+                       &small_lwe_vector[message_count * (lwe_dimension + 1)],
+                       lwe_indexes, bsk, luts_carry->pbs_buffer,
+                       glwe_dimension, lwe_dimension, polynomial_size,
+                       mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
+                       mem_ptr->params.grouping_factor, carry_count, 1, 0,
+                       max_shared_memory, mem_ptr->params.pbs_type);
+
+    int rem_blocks = r % chunk_size * num_blocks;
+    int new_blocks_created = 2 * ch_amount * num_blocks;
+    int copy_size = rem_blocks * big_lwe_size * sizeof(Torus);
+
+    auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
+    auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
+    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
+
+    total_copied = 0;
+    int total_radix_copied = 0;
+    extract_message_carry_to_full_radix<Torus>(
+        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
+        total_copied, total_radix_copied, num_blocks, true);
+    extract_message_carry_to_full_radix<Torus>(
+        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
+        total_copied, total_radix_copied, num_blocks, false);
+
+    std::swap(new_blocks, old_blocks);
+    r = (new_blocks_created + rem_blocks) / num_blocks;
+  }
+
+  dim3 add_grid(1, num_blocks, 1);
+  size_t sm_size = big_lwe_size * sizeof(Torus);
+  cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus),
+                    stream);
+  tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
+      radix_lwe_out, old_blocks, r, num_blocks);
+
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      stream, vector_result_sb, radix_lwe_out, bsk, ksk, num_blocks,
+      luts_message);
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      stream, &block_mul_res[big_lwe_size], radix_lwe_out, bsk, ksk, num_blocks,
+      luts_carry);
+
+  cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);
+
+  host_addition(stream, radix_lwe_out, vector_result_sb, block_mul_res,
+                big_lwe_size, num_blocks);
+
+  host_propagate_single_carry_low_latency<Torus>(
+      stream, radix_lwe_out, mem_ptr->scp_mem, bsk, ksk, num_blocks);
 }

 template <typename Torus>
@@ -425,16 +474,166 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
    cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    bool allocate_gpu_memory) {
-  cudaSetDevice(stream->gpu_index);
-  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  check_cuda_error(cudaFuncSetAttribute(
-      tree_add_chunks<Torus>, cudaFuncAttributeMaxDynamicSharedMemorySize,
-      sm_size));
-  cudaFuncSetCacheConfig(tree_add_chunks<Torus>, cudaFuncCachePreferShared);
-  check_cuda_error(cudaGetLastError());
-
  *mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
                                       allocate_gpu_memory);
 }

+// Applies a lookup table to a batch of LWE blocks, spreading the work over
+// several GPUs (one OpenMP thread per GPU).
+// It has two modes:
+//  lsb_msb_mode == true - extracts lsb and msb
+//  lsb_msb_mode == false - extracts message and carry
+//
+// The batch holds lsb_message_blocks_count + msb_carry_blocks_count blocks of
+// glwe_dimension * polynomial_size + 1 Torus elements each. It is split into
+// equal slices, one per GPU, the last GPU also taking the remainder. For its
+// slice, GPU i:
+//  1. copies the slice of `input_ciphertexts` (addressed through device 0)
+//     into mem_ptr->pbs_output_multi_gpu[i],
+//  2. keyswitches it into mem_ptr->pbs_input_multi_gpu[i],
+//  3. runs the multi-bit PBS back into mem_ptr->pbs_output_multi_gpu[i],
+//  4. copies the result to the same slice of `output_ciphertexts`.
+// Copies are issued on mem_ptr->streams[i]; no explicit stream
+// synchronization is performed in this function.
+//
+// At most mem_ptr->p2p_gpu_count GPUs are used, and never more GPUs than
+// there are blocks. An empty batch (or a GPU count of zero) is a no-op.
+template <typename Torus, typename STorus, class params>
+void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
+                        int_mul_memory<Torus> *mem_ptr, uint32_t glwe_dimension,
+                        uint32_t lwe_dimension, uint32_t polynomial_size,
+                        uint32_t pbs_base_log, uint32_t pbs_level,
+                        uint32_t ks_base_log, uint32_t ks_level,
+                        uint32_t grouping_factor,
+                        uint32_t lsb_message_blocks_count,
+                        uint32_t msb_carry_blocks_count,
+                        uint32_t max_shared_memory, bool lsb_msb_mode) {
+
+  int total_blocks_count = lsb_message_blocks_count + msb_carry_blocks_count;
+  int gpu_n = mem_ptr->p2p_gpu_count;
+  if (total_blocks_count < gpu_n)
+    gpu_n = total_blocks_count;
+  // Nothing to process, or no GPU to run on: leave before dividing by gpu_n
+  if (gpu_n <= 0)
+    return;
+
+  // Both values are read-only inside the parallel loop, so sharing them
+  // between the OpenMP threads is safe
+  const int base_blocks_count = total_blocks_count / gpu_n;
+  const int extra_blocks_count = total_blocks_count % gpu_n;
+  int big_lwe_size = glwe_dimension * polynomial_size + 1;
+
+#pragma omp parallel for num_threads(gpu_n)
+  for (int i = 0; i < gpu_n; i++) {
+    cudaSetDevice(i);
+    auto this_stream = mem_ptr->streams[i];
+    // Index where input and output blocks start for current gpu
+    int big_lwe_start_index = i * base_blocks_count * big_lwe_size;
+
+    // Declared inside the loop so that every thread owns its copy: a single
+    // counter shared by all threads and incremented by the last one would be
+    // a data race. Last gpu has extra blocks to process if total blocks
+    // number is not divisible by gpu_n
+    int gpu_blocks_count = base_blocks_count;
+    if (i == gpu_n - 1) {
+      gpu_blocks_count += extra_blocks_count;
+    }
+    // Size in bytes of the slice handled by this gpu, computed in size_t
+    const size_t slice_bytes =
+        static_cast<size_t>(gpu_blocks_count) * big_lwe_size * sizeof(Torus);
+
+    // Peer access is only relevant for gpus other than gpu 0; it defaults to
+    // "not available" so the host fallback is used if it is never queried
+    int can_access_peer = 0;
+    if (i != 0) {
+      check_cuda_error(cudaDeviceCanAccessPeer(&can_access_peer, i, 0));
+    }
+
+    if (i == 0) {
+      check_cuda_error(cudaMemcpyAsync(mem_ptr->pbs_output_multi_gpu[i],
+                                       &input_ciphertexts[big_lwe_start_index],
+                                       slice_bytes, cudaMemcpyDeviceToDevice,
+                                       *this_stream));
+    } else if (can_access_peer) {
+      check_cuda_error(cudaMemcpyPeerAsync(
+          mem_ptr->pbs_output_multi_gpu[i], i,
+          &input_ciphertexts[big_lwe_start_index], 0, slice_bytes,
+          *this_stream));
+    } else {
+      // Uses host memory as middle ground
+      cuda_memcpy_async_to_cpu(mem_ptr->device_to_device_buffer[i],
+                               &input_ciphertexts[big_lwe_start_index],
+                               slice_bytes, this_stream, i);
+      cuda_memcpy_async_to_gpu(mem_ptr->pbs_output_multi_gpu[i],
+                               mem_ptr->device_to_device_buffer[i],
+                               slice_bytes, this_stream, i);
+    }
+
+    // when lsb and msb have to be extracted
+    //  for first lsb_count blocks we need lsb_acc
+    //  for last msb_count blocks we need msb_acc
+    // when message and carry have to be extracted
+    //  for first message_count blocks we need message_acc
+    //  for last carry_count blocks we need carry_acc
+    // NOTE(review): big_lwe_start_index is an offset in Torus elements while
+    // lsb_message_blocks_count is a number of blocks, so this comparison
+    // mixes units - confirm whether the block index was intended. Behaviour
+    // is left unchanged here.
+    Torus *cur_lut_indexes;
+    if (lsb_msb_mode) {
+      cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
+                    ? mem_ptr->lut_indexes_lsb_multi_gpu[i]
+                    : mem_ptr->lut_indexes_msb_multi_gpu[i];
+
+    } else {
+      cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
+                    ? mem_ptr->lut_indexes_message_multi_gpu[i]
+                    : mem_ptr->lut_indexes_carry_multi_gpu[i];
+    }
+
+    // execute keyswitch on a current gpu with corresponding input and output
+    // blocks pbs_output_multi_gpu[i] is an input for keyswitch and
+    // pbs_input_multi_gpu[i] is an output for keyswitch
+    cuda_keyswitch_lwe_ciphertext_vector(
+        this_stream, i, mem_ptr->pbs_input_multi_gpu[i],
+        mem_ptr->pbs_output_multi_gpu[i], mem_ptr->ksk_multi_gpu[i],
+        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
+        gpu_blocks_count);
+
+    // execute pbs on a current gpu with corresponding input and output
+    cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
+        this_stream, i, mem_ptr->pbs_output_multi_gpu[i],
+        mem_ptr->lut_multi_gpu[i], cur_lut_indexes,
+        mem_ptr->pbs_input_multi_gpu[i], mem_ptr->bsk_multi_gpu[i],
+        mem_ptr->pbs_buffer_multi_gpu[i], lwe_dimension, glwe_dimension,
+        polynomial_size, grouping_factor, pbs_base_log, pbs_level,
+        grouping_factor, gpu_blocks_count, 2, 0, max_shared_memory);
+
+    // lookup table is applied and now data from current gpu have to be copied
+    // back to gpu_0 in 'output_ciphertexts' buffer
+    if (i == 0) {
+      check_cuda_error(cudaMemcpyAsync(&output_ciphertexts[big_lwe_start_index],
+                                       mem_ptr->pbs_output_multi_gpu[i],
+                                       slice_bytes, cudaMemcpyDeviceToDevice,
+                                       *this_stream));
+    } else if (can_access_peer) {
+      check_cuda_error(cudaMemcpyPeerAsync(
+          &output_ciphertexts[big_lwe_start_index], 0,
+          mem_ptr->pbs_output_multi_gpu[i], i, slice_bytes, *this_stream));
+    } else {
+      // Uses host memory as middle ground
+      cuda_memcpy_async_to_cpu(mem_ptr->device_to_device_buffer[i],
+                               mem_ptr->pbs_output_multi_gpu[i], slice_bytes,
+                               this_stream, i);
+      cuda_memcpy_async_to_gpu(&output_ciphertexts[big_lwe_start_index],
+                               mem_ptr->device_to_device_buffer[i],
+                               slice_bytes, this_stream, i);
+    }
+  }
+}
+
+// Kernel multiplying LWE ciphertexts by a cleartext scalar, one array element
+// per thread.
+//  output_lwe_array - receives the products
+//  input_lwe_array  - elements to multiply, read at the same index
+//  scalar           - factor applied to every element
+//  lwe_dimension    - each ciphertext spans lwe_dimension + 1 elements
+//  num_blocks       - number of ciphertexts in the arrays
+// Threads whose global index falls beyond num_blocks * (lwe_dimension + 1)
+// do nothing.
+template <typename T>
+__global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
+                                                         T *input_lwe_array,
+                                                         T scalar,
+                                                         uint32_t lwe_dimension,
+                                                         uint32_t num_blocks) {
+
+  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int elements_per_lwe = lwe_dimension + 1;
+  // Surplus threads of the last thread block have no element to process
+  const bool in_range = tid < num_blocks * elements_per_lwe;
+  if (!in_range)
+    return;
+
+  // The product is allowed to wrap around (T is presumably an unsigned
+  // integer type - confirm against the callers)
+  output_lwe_array[tid] = input_lwe_array[tid] * scalar;
+}
+
+// Host-side launcher of device_small_scalar_radix_multiplication: multiplies
+// every element of `input_lwe_ciphertext_count` LWE ciphertexts by `scalar`
+// and stores the products in `output_lwe_array`.
+//  stream                     - gives the gpu index to select and the CUDA
+//                               stream the kernel is launched on
+//  output_lwe_array           - destination array
+//  input_lwe_array            - source array
+//  scalar                     - cleartext factor
+//  input_lwe_dimension        - mask size of each ciphertext
+//  input_lwe_ciphertext_count - number of ciphertexts to process
+// An empty batch only selects the device and returns without launching
+// anything. The kernel launch status is checked with check_cuda_error.
+template <typename T>
+__host__ void host_integer_small_scalar_mult_radix(
+    cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
+    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
+
+  cudaSetDevice(stream->gpu_index);
+  // lwe_size includes the presence of the body
+  // whereas lwe_dimension is the number of elements in the mask
+  int lwe_size = input_lwe_dimension + 1;
+  int num_entries = input_lwe_ciphertext_count * lwe_size;
+  // With no entry there is nothing to multiply, and a kernel must not be
+  // launched with an empty grid
+  if (num_entries == 0)
+    return;
+
+  // Create a 1-dimensional grid of threads
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
+      output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
+      input_lwe_ciphertext_count);
+  check_cuda_error(cudaGetLastError());
+}
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -10,91 +10,3 @@ void cuda_negate_integer_radix_ciphertext_64_inplace(
                              lwe_ciphertext_count, message_modulus,
                              carry_modulus);
 }
-
-void scratch_cuda_integer_radix_overflowing_sub_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_integer_overflowing_sub_kb<uint64_t>(
-      stream, (int_overflowing_sub_memory<uint64_t> **)mem_ptr, num_blocks,
-      params, allocate_gpu_memory);
-}
-
-void cuda_integer_radix_overflowing_sub_kb_64(
-    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_overflowed,
-    void *radix_lwe_left, void *radix_lwe_right, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t num_blocks) {
-
-  auto mem = (int_overflowing_sub_memory<uint64_t> *)mem_ptr;
-
-  switch (mem->params.polynomial_size) {
-  case 512:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<512>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
-    break;
-  case 1024:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<1024>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
-    break;
-  case 2048:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
-    break;
-  case 4096:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<4096>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
-    break;
-  case 8192:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<8192>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
-    break;
-  case 16384:
-    host_integer_overflowing_sub_kb<uint64_t, AmortizedDegree<16384>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_overflowed),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), mem, num_blocks);
-    break;
-  default:
-    PANIC("Cuda error (integer overflowing sub): unsupported polynomial size. "
-          "Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
-  }
-}
-
-void cleanup_cuda_integer_radix_overflowing_sub(cuda_stream_t *stream,
-                                                int8_t **mem_ptr_void) {
-  int_overflowing_sub_memory<uint64_t> *mem_ptr =
-      (int_overflowing_sub_memory<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release(stream);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -6,20 +6,9 @@
 #include <cuda_runtime.h>
 #endif

-#include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.h"
-#include "integer/integer.cuh"
-#include "linear_algebra.h"
-#include "programmable_bootstrap.h"
-#include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
-#include <fstream>
-#include <iostream>
-#include <omp.h>
-#include <sstream>
-#include <string>
-#include <vector>

 template <typename Torus>
 __global__ void
@@ -87,32 +76,4 @@ __host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
  check_cuda_error(cudaGetLastError());
 }

-template <typename Torus>
-__host__ void scratch_cuda_integer_overflowing_sub_kb(
-    cuda_stream_t *stream, int_overflowing_sub_memory<Torus> **mem_ptr,
-    uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
-
-  cudaSetDevice(stream->gpu_index);
-  *mem_ptr = new int_overflowing_sub_memory<Torus>(stream, params, num_blocks,
-                                                   allocate_gpu_memory);
-}
-
-template <typename Torus, class params>
-__host__ void host_integer_overflowing_sub_kb(
-    cuda_stream_t *stream, Torus *radix_lwe_out, Torus *radix_lwe_overflowed,
-    Torus *radix_lwe_left, Torus *radix_lwe_right, void *bsk, uint64_t *ksk,
-    int_overflowing_sub_memory<uint64_t> *mem_ptr, uint32_t num_blocks) {
-
-  auto radix_params = mem_ptr->params;
-
-  host_unchecked_sub_with_correcting_term(
-      stream, radix_lwe_out, radix_lwe_left, radix_lwe_right,
-      radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
-      radix_params.carry_modulus, radix_params.message_modulus - 1);
-
-  host_propagate_single_sub_borrow<Torus>(
-      stream, radix_lwe_overflowed, radix_lwe_out, mem_ptr->borrow_prop_mem,
-      bsk, ksk, num_blocks);
-}
-
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -11,7 +11,6 @@ __host__ void host_integer_radix_scalar_bitop_kb(
    int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    uint32_t num_radix_blocks, BITOP_TYPE op) {

-  cudaSetDevice(stream->gpu_index);
  auto lut = mem_ptr->lut;
  auto params = lut->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
@@ -20,6 +19,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(

  if (num_clear_blocks == 0) {
    if (op == SCALAR_BITAND) {
+      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
      cuda_memset_async(lwe_array_out, 0,
                        num_radix_blocks * lwe_size * sizeof(Torus), stream);
    } else {
@@ -28,6 +28,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
                                   stream);
    }
  } else {
+    auto lut_buffer = lut->lut;
    // We have all possible LUTs pre-computed and we use the decomposed scalar
    // as index to recover the right one
    cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
@@ -37,7 +38,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
        stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
        lut);

-    if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
+    if (op == SCALAR_BITAND) {
      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
      cuda_memset_async(lwe_array_out_block, 0,
                        (num_radix_blocks - num_clear_blocks) * lwe_size *
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
@@ -8,14 +8,17 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;
  switch (buffer->op) {
-  case EQ:
-  case NE:
-    host_integer_radix_scalar_equality_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
-    break;
+    //  case EQ:
+    //  case NE:
+    //    host_integer_radix_equality_check_kb<uint64_t>(
+    //        stream, static_cast<uint64_t *>(lwe_array_out),
+    //        static_cast<uint64_t *>(lwe_array_1),
+    //        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
+    //        static_cast<uint64_t *>(ksk), glwe_dimension, polynomial_size,
+    //        big_lwe_dimension, small_lwe_dimension, ks_level, ks_base_log,
+    //        pbs_level, pbs_base_log, grouping_factor, lwe_ciphertext_count,
+    //        message_modulus, carry_modulus);
+    //    break;
  case GT:
  case GE:
  case LT:
@@ -36,6 +39,6 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
        static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
    break;
  default:
-    PANIC("Cuda error: integer operation not supported")
+    printf("Not implemented\n");
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -5,13 +5,12 @@
 #include <omp.h>

 template <typename Torus>
-__host__ void integer_radix_unsigned_scalar_difference_check_kb(
+__host__ void host_integer_radix_scalar_difference_check_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
@@ -22,6 +21,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
  auto diff_buffer = mem_ptr->diff_buffer;

  size_t big_lwe_size = big_lwe_dimension + 1;
+  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  // Reducing the signs is the bottleneck of the comparison algorithms,
  // however if the scalar case there is an improvement:
@@ -46,9 +46,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
  if (total_num_scalar_blocks == 0) {
    // We only have to compare blocks with zero
    // means scalar is zero
-    host_compare_with_zero_equality(
-        stream, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsk, ksk,
-        total_num_radix_blocks, mem_ptr->is_zero_lut);
+    host_compare_with_zero_equality(stream, mem_ptr->tmp_lwe_array_out,
+                                    lwe_array_in, mem_ptr, bsk, ksk,
+                                    total_num_radix_blocks);

    auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
      x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -64,6 +64,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);

+    // The result is in the first block only. Everything else is garbage,
+    //  so the remaining blocks are zeroed.
+    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
+                      big_lwe_size_bytes * (total_num_radix_blocks - 1),
+                      stream);
+
  } else if (total_num_scalar_blocks < total_num_radix_blocks) {
    // We have to handle both part of the work described above

@@ -71,14 +77,15 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    uint32_t num_msb_radix_blocks =
        total_num_radix_blocks - num_lsb_radix_blocks;

+    auto lsb = lwe_array_in;
    auto msb = lwe_array_in + num_lsb_radix_blocks * big_lwe_size;

    auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;

    cuda_synchronize_stream(stream);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
+    auto lsb_stream = diff_buffer->lsb_stream;
+    auto msb_stream = diff_buffer->msb_stream;

 #pragma omp parallel sections
    {
@@ -113,7 +120,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        // final sign
        tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
                            mem_ptr->diff_buffer->tree_buffer,
-                            mem_ptr->identity_lut_f, bsk, ksk,
+                            mem_ptr->cleaning_lut_f, bsk, ksk,
                            num_lsb_radix_blocks);
      }
 #pragma omp section
@@ -121,8 +128,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        //////////////
        // msb
        host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
-                                        mem_ptr, bsk, ksk, num_msb_radix_blocks,
-                                        mem_ptr->is_zero_lut);
+                                        mem_ptr, bsk, ksk,
+                                        num_msb_radix_blocks);
      }
    }
    cuda_synchronize_stream(lsb_stream);
@@ -148,6 +155,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        stream, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsk, ksk,
        1, lut);

+    // The result will be in the first block. Everything else is garbage.
+    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
+                      (total_num_radix_blocks - 1) * big_lwe_size_bytes,
+                      stream);
  } else {
    // We only have to do the regular comparison
    // And not the part where we compare most significant blocks with zeros
@@ -155,6 +166,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    uint32_t num_lsb_radix_blocks = total_num_radix_blocks;
    uint32_t num_scalar_blocks = total_num_scalar_blocks;

+    auto lsb = lwe_array_in;
+
    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

@@ -181,344 +194,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    tree_sign_reduction(stream, lwe_array_out, comparisons,
                        mem_ptr->diff_buffer->tree_buffer, sign_handler_f, bsk,
                        ksk, num_lsb_radix_blocks);
-  }
-}

-template <typename Torus>
-__host__ void integer_radix_signed_scalar_difference_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
-    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
-
-  cudaSetDevice(stream->gpu_index);
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  auto diff_buffer = mem_ptr->diff_buffer;
-
-  size_t big_lwe_size = big_lwe_dimension + 1;
-
-  // Reducing the signs is the bottleneck of the comparison algorithms,
-  // however if the scalar case there is an improvement:
-  //
-  // The idea is to reduce the number of signs block we have to
-  // reduce. We can do that by splitting the comparison problem in two parts.
-  //
-  // - One part where we compute the signs block between the scalar with just
-  // enough blocks
-  //   from the ciphertext that can represent the scalar value
-  //
-  // - The other part is to compare the ciphertext blocks not considered for the
-  // sign
-  //   computation with zero, and create a single sign block from that.
-  //
-  // The smaller the scalar value is compared to the ciphertext num bits
-  // encrypted, the more the comparisons with zeros we have to do, and the less
-  // signs block we will have to reduce.
-  //
-  // This will create a speedup as comparing a bunch of blocks with 0
-  // is faster
-  if (total_num_scalar_blocks == 0) {
-    // We only have to compare blocks with zero
-    // means scalar is zero
-    Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
-    host_compare_with_zero_equality(stream, are_all_msb_zeros, lwe_array_in,
-                                    mem_ptr, bsk, ksk, total_num_radix_blocks,
-                                    mem_ptr->is_zero_lut);
-    Torus *sign_block =
-        lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
-
-    auto sign_bit_pos = (int)std::log2(message_modulus) - 1;
-
-    auto scalar_last_leaf_with_respect_to_zero_lut_f =
-        [sign_handler_f, sign_bit_pos,
-         message_modulus](Torus sign_block) -> Torus {
-      sign_block %= message_modulus;
-      int sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
-      CMP_ORDERING sign_block_ordering;
-      if (sign_bit_is_set) {
-        sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
-      } else if (sign_block != 0) {
-        sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
-      } else {
-        sign_block_ordering = CMP_ORDERING::IS_EQUAL;
-      }
-
-      return sign_block_ordering;
-    };
-
-    auto block_selector_f = mem_ptr->diff_buffer->tree_buffer->block_selector_f;
-    auto scalar_bivariate_last_leaf_lut_f =
-        [scalar_last_leaf_with_respect_to_zero_lut_f, sign_handler_f,
-         block_selector_f](Torus are_all_zeros, Torus sign_block) -> Torus {
-      // "re-code" are_all_zeros as an ordering value
-      if (are_all_zeros == 1) {
-        are_all_zeros = CMP_ORDERING::IS_EQUAL;
-      } else {
-        are_all_zeros = CMP_ORDERING::IS_SUPERIOR;
-      };
-
-      return sign_handler_f(block_selector_f(
-          scalar_last_leaf_with_respect_to_zero_lut_f(sign_block),
-          are_all_zeros));
-    };
-
-    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-    generate_device_accumulator_bivariate<Torus>(
-        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
-        carry_modulus, scalar_bivariate_last_leaf_lut_f);
-
-    integer_radix_apply_bivariate_lookup_table_kb(
-        stream, lwe_array_out, are_all_msb_zeros, sign_block, bsk, ksk, 1, lut);
-
-  } else if (total_num_scalar_blocks < total_num_radix_blocks) {
-    // We have to handle both part of the work described above
-    // And the sign bit is located in the most_significant_blocks
-
-    uint32_t num_lsb_radix_blocks = total_num_scalar_blocks;
-    uint32_t num_msb_radix_blocks =
-        total_num_radix_blocks - num_lsb_radix_blocks;
-    auto msb = lwe_array_in + num_lsb_radix_blocks * big_lwe_size;
-
-    auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
-    auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;
-
-    cuda_synchronize_stream(stream);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
-
-#pragma omp parallel sections
-    {
-      // Both sections may be executed in parallel
-#pragma omp section
-      {
-        //////////////
-        // lsb
-        Torus *lhs = diff_buffer->tmp_packed_left;
-        Torus *rhs = diff_buffer->tmp_packed_right;
-
-        pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
-                    num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
-                    message_modulus);
-
-        // From this point we have half number of blocks
-        num_lsb_radix_blocks /= 2;
-        num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
-
-        // comparisons will be assigned
-        // - 0 if lhs < rhs
-        // - 1 if lhs == rhs
-        // - 2 if lhs > rhs
-
-        auto comparisons = mem_ptr->tmp_block_comparisons;
-        scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
-                                       mem_ptr, bsk, ksk, num_lsb_radix_blocks);
-
-        // Reduces a vec containing radix blocks that encrypts a sign
-        // (inferior, equal, superior) to one single radix block containing the
-        // final sign
-        tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
-                            mem_ptr->diff_buffer->tree_buffer,
-                            mem_ptr->identity_lut_f, bsk, ksk,
-                            num_lsb_radix_blocks);
-      }
-#pragma omp section
-      {
-        //////////////
-        // msb
-        // We remove the last block (which is the sign)
-        Torus *are_all_msb_zeros = lwe_array_msb_out;
-        host_compare_with_zero_equality(msb_stream, are_all_msb_zeros, msb,
-                                        mem_ptr, bsk, ksk, num_msb_radix_blocks,
-                                        mem_ptr->is_zero_lut);
-
-        auto sign_bit_pos = (int)log2(message_modulus) - 1;
-
-        auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
-                                             Torus msb_are_zeros) {
-          bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
-          CMP_ORDERING sign_block_ordering;
-          if (sign_bit_is_set) {
-            sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
-          } else if (sign_block != 0) {
-            sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
-          } else {
-            sign_block_ordering = CMP_ORDERING::IS_EQUAL;
-          }
-
-          CMP_ORDERING msb_ordering;
-          if (msb_are_zeros == 1)
-            msb_ordering = CMP_ORDERING::IS_EQUAL;
-          else
-            msb_ordering = CMP_ORDERING::IS_SUPERIOR;
-
-          return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
-              sign_block_ordering, msb_ordering);
-        };
-
-        auto signed_msb_lut = mem_ptr->signed_msb_lut;
-        generate_device_accumulator_bivariate<Torus>(
-            msb_stream, signed_msb_lut->lut, params.glwe_dimension,
-            params.polynomial_size, params.message_modulus,
-            params.carry_modulus, lut_f);
-
-        Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
-        integer_radix_apply_bivariate_lookup_table_kb(
-            msb_stream, lwe_array_msb_out, sign_block, are_all_msb_zeros, bsk,
-            ksk, 1, signed_msb_lut);
-      }
-    }
-    cuda_synchronize_stream(lsb_stream);
-    cuda_synchronize_stream(msb_stream);
-
-    //////////////
-    // Reduce the two blocks into one final
-    reduce_signs(stream, lwe_array_out, lwe_array_lsb_out, mem_ptr,
-                 sign_handler_f, bsk, ksk, 2);
-
-  } else {
-    // We only have to do the regular comparison
-    // And not the part where we compare most significant blocks with zeros
-    // total_num_radix_blocks == total_num_scalar_blocks
-    uint32_t num_lsb_radix_blocks = total_num_radix_blocks;
-
-    cuda_synchronize_stream(stream);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
-
-    auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
-    auto lwe_array_sign_out =
-        lwe_array_ct_out + (num_lsb_radix_blocks / 2) * big_lwe_size;
-#pragma omp parallel sections
-    {
-      // Both sections may be executed in parallel
-#pragma omp section
-      {
-        Torus *lhs = diff_buffer->tmp_packed_left;
-        Torus *rhs = diff_buffer->tmp_packed_right;
-
-        pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
-                    num_lsb_radix_blocks - 1, message_modulus);
-        pack_blocks(lsb_stream, rhs, scalar_blocks, 0, num_lsb_radix_blocks - 1,
-                    message_modulus);
-
-        // From this point we have half number of blocks
-        num_lsb_radix_blocks /= 2;
-
-        // comparisons will be assigned
-        // - 0 if lhs < rhs
-        // - 1 if lhs == rhs
-        // - 2 if lhs > rhs
-        scalar_compare_radix_blocks_kb(lsb_stream, lwe_array_ct_out, lhs, rhs,
-                                       mem_ptr, bsk, ksk, num_lsb_radix_blocks);
-      }
-#pragma omp section
-      {
-        Torus *encrypted_sign_block =
-            lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
-        Torus *scalar_sign_block =
-            scalar_blocks + (total_num_scalar_blocks - 1);
-
-        auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
-        create_trivial_radix(msb_stream, trivial_sign_block, scalar_sign_block,
-                             big_lwe_dimension, 1, 1, message_modulus,
-                             carry_modulus);
-
-        integer_radix_apply_bivariate_lookup_table_kb(
-            msb_stream, lwe_array_sign_out, encrypted_sign_block,
-            trivial_sign_block, bsk, ksk, 1, mem_ptr->signed_lut);
-      }
-    }
-    cuda_synchronize_stream(lsb_stream);
-    cuda_synchronize_stream(msb_stream);
-
-    // Reduces a vec containing radix blocks that encrypts a sign
-    // (inferior, equal, superior) to one single radix block containing the
-    // final sign
-    reduce_signs(stream, lwe_array_out, lwe_array_ct_out, mem_ptr,
-                 sign_handler_f, bsk, ksk, num_lsb_radix_blocks + 1);
-  }
-}
-
-template <typename Torus>
-__host__ void integer_radix_signed_scalar_maxmin_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t total_num_radix_blocks,
-    uint32_t total_num_scalar_blocks) {
-
-  cudaSetDevice(stream->gpu_index);
-  auto params = mem_ptr->params;
-  // Calculates the difference sign between the ciphertext and the scalar
-  // - 0 if lhs < rhs
-  // - 1 if lhs == rhs
-  // - 2 if lhs > rhs
-  auto sign = mem_ptr->tmp_lwe_array_out;
-  integer_radix_signed_scalar_difference_check_kb(
-      stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
-      mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
-      total_num_scalar_blocks);
-
-  // There is no optimized CMUX for scalars, so we convert to a trivial
-  // ciphertext
-  auto lwe_array_left = lwe_array_in;
-  auto lwe_array_right = mem_ptr->tmp_block_comparisons;
-
-  create_trivial_radix(stream, lwe_array_right, scalar_blocks,
-                       params.big_lwe_dimension, total_num_radix_blocks,
-                       total_num_scalar_blocks, params.message_modulus,
-                       params.carry_modulus);
-
-  // Selector
-  // CMUX for Max or Min
-  host_integer_radix_cmux_kb(stream, lwe_array_out, sign, lwe_array_left,
-                             lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk,
-                             total_num_radix_blocks);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_scalar_difference_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
-    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
-
-  if (mem_ptr->is_signed) {
-    // is signed and scalar is positive
-    integer_radix_signed_scalar_difference_check_kb(
-        stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr,
-        sign_handler_f, bsk, ksk, total_num_radix_blocks,
-        total_num_scalar_blocks);
-  } else {
-    integer_radix_unsigned_scalar_difference_check_kb(
-        stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr,
-        sign_handler_f, bsk, ksk, total_num_radix_blocks,
-        total_num_scalar_blocks);
-  }
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_signed_scalar_maxmin_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t total_num_radix_blocks,
-    uint32_t total_num_scalar_blocks) {
-
-  if (mem_ptr->is_signed) {
-    // is signed and scalar is positive
-    integer_radix_signed_scalar_maxmin_kb(
-        stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsk, ksk,
-        total_num_radix_blocks, total_num_scalar_blocks);
-  } else {
-    integer_radix_unsigned_scalar_maxmin_kb(
-        stream, lwe_array_out, lwe_array_in, scalar_blocks, mem_ptr, bsk, ksk,
-        total_num_radix_blocks, total_num_scalar_blocks);
+    // The result will be in the first block. Everything else is garbage.
+    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
+                      (total_num_radix_blocks - 1) * big_lwe_size_bytes,
+                      stream);
  }
 }

@@ -529,9 +209,17 @@ scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                               Torus *ksk, uint32_t num_radix_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;
+  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
+  auto small_lwe_dimension = params.small_lwe_dimension;
+  auto ks_level = params.ks_level;
+  auto ks_base_log = params.ks_base_log;
+  auto pbs_level = params.pbs_level;
+  auto pbs_base_log = params.pbs_base_log;
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
+  auto grouping_factor = params.grouping_factor;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

@@ -579,7 +267,6 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
    Torus *ksk, uint32_t total_num_radix_blocks,
    uint32_t total_num_scalar_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem_ptr->params;

  // Calculates the difference sign between the ciphertext and the scalar
@@ -589,7 +276,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  auto sign = mem_ptr->tmp_lwe_array_out;
  host_integer_radix_scalar_difference_check_kb(
      stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
-      mem_ptr->identity_lut_f, bsk, ksk, total_num_radix_blocks,
+      mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks,
      total_num_scalar_blocks);

  // There is no optimized CMUX for scalars, so we convert to a trivial
@@ -608,108 +295,4 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
 }
-
-template <typename Torus>
-__host__ void host_integer_radix_scalar_equality_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-
-  auto eq_buffer = mem_ptr->eq_buffer;
-
-  size_t big_lwe_size = big_lwe_dimension + 1;
-
-  auto scalar_comparison_luts = eq_buffer->scalar_comparison_luts;
-
-  uint32_t num_halved_scalar_blocks =
-      (num_scalar_blocks / 2) + (num_scalar_blocks % 2);
-
-  uint32_t num_lsb_radix_blocks =
-      std::min(num_radix_blocks, 2 * num_halved_scalar_blocks);
-  uint32_t num_msb_radix_blocks = num_radix_blocks - num_lsb_radix_blocks;
-  uint32_t num_halved_lsb_radix_blocks =
-      (num_lsb_radix_blocks / 2) + (num_lsb_radix_blocks % 2);
-
-  auto lsb = lwe_array_in;
-  auto msb = lwe_array_in + big_lwe_size * num_lsb_radix_blocks;
-
-  auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
-  auto lwe_array_msb_out =
-      lwe_array_lsb_out + big_lwe_size * num_halved_lsb_radix_blocks;
-
-  cuda_synchronize_stream(stream);
-
-  auto lsb_stream = mem_ptr->lsb_stream;
-  auto msb_stream = mem_ptr->msb_stream;
-
-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      if (num_halved_scalar_blocks > 0) {
-        auto packed_blocks = mem_ptr->tmp_packed_input;
-        auto packed_scalar =
-            packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
-
-        pack_blocks(lsb_stream, packed_blocks, lsb, big_lwe_dimension,
-                    num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, packed_scalar, scalar_blocks, 0,
-                    num_scalar_blocks, message_modulus);
-
-        cuda_memcpy_async_gpu_to_gpu(
-            scalar_comparison_luts->lut_indexes, packed_scalar,
-            num_halved_scalar_blocks * sizeof(Torus), lsb_stream);
-
-        integer_radix_apply_univariate_lookup_table_kb(
-            lsb_stream, lwe_array_lsb_out, packed_blocks, bsk, ksk,
-            num_halved_lsb_radix_blocks, scalar_comparison_luts);
-      }
-    }
-#pragma omp section
-    {
-      //////////////
-      // msb
-      if (num_msb_radix_blocks > 0) {
-        int_radix_lut<Torus> *msb_lut;
-        switch (mem_ptr->op) {
-        case COMPARISON_TYPE::EQ:
-          msb_lut = mem_ptr->is_zero_lut;
-          break;
-        case COMPARISON_TYPE::NE:
-          msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-          break;
-        default:
-          PANIC("Cuda error: integer operation not supported")
-        }
-
-        host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
-                                        mem_ptr, bsk, ksk, num_msb_radix_blocks,
-                                        msb_lut);
-      }
-    }
-  }
-
-  cuda_synchronize_stream(lsb_stream);
-  cuda_synchronize_stream(msb_stream);
-
-  switch (mem_ptr->op) {
-  case COMPARISON_TYPE::EQ:
-    are_all_comparisons_block_true(
-        stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
-        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
-    break;
-  case COMPARISON_TYPE::NE:
-    is_at_least_one_comparisons_block_true(
-        stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
-        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
-    break;
-  default:
-    PANIC("Cuda error: integer operation not supported")
-  }
-}
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -1,89 +0,0 @@
-#include "integer/scalar_mul.cuh"
-
-void scratch_cuda_integer_scalar_mul_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus);
-
-  scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
-      stream, (int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
-      allocate_gpu_memory);
-}
-
-void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, uint64_t *decomposed_scalar,
-    uint64_t *has_at_least_one_set, int8_t *mem, void *bsk, void *ksk,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t num_blocks, uint32_t num_scalars) {
-
-  switch (polynomial_size) {
-  case 512:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
-        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
-    break;
-  case 1024:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
-        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
-    break;
-  case 2048:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
-        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
-    break;
-  case 4096:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
-        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
-    break;
-  case 8192:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
-        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
-    break;
-  case 16384:
-    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
-        stream, static_cast<uint64_t *>(lwe_array), decomposed_scalar,
-        has_at_least_one_set,
-        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsk,
-        static_cast<uint64_t *>(ksk), lwe_dimension, message_modulus,
-        num_blocks, num_scalars);
-    break;
-  default:
-    PANIC("Cuda error (scalar multiplication): unsupported polynomial size. "
-          "Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
-  }
-}
-
-void cleanup_cuda_integer_radix_scalar_mul(cuda_stream_t *stream,
-                                           int8_t **mem_ptr_void) {
-
-  cudaSetDevice(stream->gpu_index);
-  int_scalar_mul_buffer<uint64_t> *mem_ptr =
-      (int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release(stream);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -1,136 +0,0 @@
-#ifndef CUDA_INTEGER_SCALAR_MUL_CUH
-#define CUDA_INTEGER_SCALAR_MUL_CUH
-
-#ifdef __CDT_PARSER__
-#undef __CUDA_RUNTIME_H__
-#include <cuda_runtime.h>
-#endif
-
-#include "device.h"
-#include "integer.h"
-#include "multiplication.cuh"
-#include "scalar_shifts.cuh"
-#include "utils/kernel_dimensions.cuh"
-#include <stdio.h>
-
-template <typename T>
-__global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
-                                                         T *input_lwe_array,
-                                                         T scalar,
-                                                         uint32_t lwe_dimension,
-                                                         uint32_t num_blocks) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int lwe_size = lwe_dimension + 1;
-  if (index < num_blocks * lwe_size) {
-    // Here we take advantage of the wrapping behaviour of uint
-    output_lwe_array[index] = input_lwe_array[index] * scalar;
-  }
-}
-
-template <typename T>
-__host__ void scratch_cuda_integer_radix_scalar_mul_kb(
-    cuda_stream_t *stream, int_scalar_mul_buffer<T> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
-
-  cudaSetDevice(stream->gpu_index);
-  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
-  check_cuda_error(cudaFuncSetAttribute(
-      tree_add_chunks<T>, cudaFuncAttributeMaxDynamicSharedMemorySize,
-      sm_size));
-  cudaFuncSetCacheConfig(tree_add_chunks<T>, cudaFuncCachePreferShared);
-  check_cuda_error(cudaGetLastError());
-
-  *mem_ptr = new int_scalar_mul_buffer<T>(stream, params, num_radix_blocks,
-                                          allocate_gpu_memory);
-}
-
-template <typename T, class params>
-__host__ void host_integer_scalar_mul_radix(
-    cuda_stream_t *stream, T *lwe_array, T *decomposed_scalar,
-    T *has_at_least_one_set, int_scalar_mul_buffer<T> *mem, void *bsk, T *ksk,
-    uint32_t input_lwe_dimension, uint32_t message_modulus,
-    uint32_t num_radix_blocks, uint32_t num_scalars) {
-
-  if (num_radix_blocks == 0 | num_scalars == 0)
-    return;
-
-  cudaSetDevice(stream->gpu_index);
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  uint32_t lwe_size = input_lwe_dimension + 1;
-  uint32_t lwe_size_bytes = lwe_size * sizeof(T);
-  uint32_t msg_bits = (uint32_t)std::log2(message_modulus);
-  uint32_t num_ciphertext_bits = msg_bits * num_radix_blocks;
-
-  T *preshifted_buffer = mem->preshifted_buffer;
-  T *all_shifted_buffer = mem->all_shifted_buffer;
-
-  for (size_t shift_amount = 0; shift_amount < msg_bits; shift_amount++) {
-    T *ptr = preshifted_buffer + shift_amount * lwe_size * num_radix_blocks;
-    if (has_at_least_one_set[shift_amount] == 1) {
-      cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
-                                   lwe_size_bytes * num_radix_blocks, stream);
-      host_integer_radix_logical_scalar_shift_kb_inplace(
-          stream, ptr, shift_amount, mem->logical_scalar_shift_buffer, bsk, ksk,
-          num_radix_blocks);
-    } else {
-      // create trivial assign for value = 0
-      cuda_memset_async(ptr, 0, num_radix_blocks * lwe_size_bytes, stream);
-    }
-  }
-  size_t j = 0;
-  for (size_t i = 0; i < min(num_scalars, num_ciphertext_bits); i++) {
-    if (decomposed_scalar[i] == 1) {
-      // Perform a block shift
-      T *preshifted_radix_ct =
-          preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
-      T *block_shift_buffer =
-          all_shifted_buffer + j * num_radix_blocks * lwe_size;
-      radix_blocks_rotate_right<<<num_radix_blocks, 256, 0, stream->stream>>>(
-          block_shift_buffer, preshifted_radix_ct, i / msg_bits,
-          num_radix_blocks, lwe_size);
-      // create trivial assign for value = 0
-      cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
-                        stream);
-      j++;
-    }
-  }
-
-  if (j == 0) {
-    // lwe array = 0
-    cuda_memset_async(lwe_array, 0, num_radix_blocks * lwe_size_bytes, stream);
-  } else {
-    int terms_degree[j * num_radix_blocks];
-    for (int i = 0; i < j * num_radix_blocks; i++) {
-      terms_degree[i] = message_modulus - 1;
-    }
-    host_integer_sum_ciphertexts_vec_kb<T, params>(
-        stream, lwe_array, all_shifted_buffer, terms_degree, bsk, ksk,
-        mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);
-  }
-}
-
-// Small scalar_mul is used in shift/rotate
-template <typename T>
-__host__ void host_integer_small_scalar_mul_radix(
-    cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
-
-  cudaSetDevice(stream->gpu_index);
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = input_lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = input_lwe_ciphertext_count * lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
-      output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
-      input_lwe_ciphertext_count);
-  check_cuda_error(cudaGetLastError());
-}
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -6,8 +6,7 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory) {
+    PBS_TYPE pbs_type, SHIFT_TYPE shift_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
@@ -15,8 +14,8 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
-      stream, (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
-      params, shift_type, allocate_gpu_memory);
+      stream, (int_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
+      shift_type, allocate_gpu_memory);
 }

 void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,
@@ -27,15 +26,15 @@ void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,

  host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array), n,
-      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (int_shift_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
+      num_blocks);
 }

 void cleanup_cuda_integer_radix_scalar_rotate(cuda_stream_t *stream,
                                              int8_t **mem_ptr_void) {

-  int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
-      (int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
+  int_shift_buffer<uint64_t> *mem_ptr =
+      (int_shift_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(stream);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -5,30 +5,40 @@
 #include "device.h"
 #include "integer.cuh"
 #include "integer.h"
-#include "pbs/programmable_bootstrap_classic.cuh"
-#include "pbs/programmable_bootstrap_multibit.cuh"
+#include "pbs/bootstrap_low_latency.cuh"
+#include "pbs/bootstrap_multibit.cuh"
+#include "types/complex/operations.cuh"
+#include "utils/helper.cuh"
+#include "utils/kernel_dimensions.cuh"
+
+#ifndef CUDA_INTEGER_SHIFT_OPS_CUH
+#define CUDA_INTEGER_SHIFT_OPS_CUH
+
+#include "crypto/keyswitch.cuh"
+#include "device.h"
+#include "integer.cuh"
+#include "integer.h"
+#include "pbs/bootstrap_low_latency.cuh"
+#include "pbs/bootstrap_multibit.cuh"
 #include "types/complex/operations.cuh"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"

 template <typename Torus>
 __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
-    cuda_stream_t *stream, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
+    cuda_stream_t *stream, int_shift_buffer<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
+    bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
-  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
-      stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
+  *mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
+                                         num_radix_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
 __host__ void host_integer_radix_scalar_rotate_kb_inplace(
    cuda_stream_t *stream, Torus *lwe_array, uint32_t n,
-    int_logical_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
-    uint32_t num_blocks) {
+    int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -99,4 +109,6 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
  }
 }

+#endif // CUDA_INTEGER_SHIFT_OPS_CUH
+
 #endif // CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -1,90 +1,38 @@
 #include "scalar_shifts.cuh"

-void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
+void scratch_cuda_integer_radix_scalar_shift_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory) {
+    PBS_TYPE pbs_type, SHIFT_TYPE shift_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

-  scratch_cuda_integer_radix_logical_scalar_shift_kb<uint64_t>(
-      stream, (int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
-      params, shift_type, allocate_gpu_memory);
+  scratch_cuda_integer_radix_scalar_shift_kb<uint64_t>(
+      stream, (int_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
+      shift_type, allocate_gpu_memory);
 }

-/// The logical scalar shift is the one used for unsigned integers, and
-/// for the left scalar shift. It is constituted of a rotation, followed by
-/// the application of a PBS onto the rotated blocks up to num_blocks -
-/// rotations - 1 The remaining blocks are padded with zeros
-void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
+void cuda_integer_radix_scalar_shift_kb_64_inplace(
    cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
    void *bsk, void *ksk, uint32_t num_blocks) {

-  host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
+  host_integer_radix_scalar_shift_kb_inplace<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array), shift,
-      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
+      (int_shift_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
+      num_blocks);
 }

-void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory) {
+void cleanup_cuda_integer_radix_scalar_shift(cuda_stream_t *stream,
+                                             int8_t **mem_ptr_void) {

-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_integer_radix_arithmetic_scalar_shift_kb<uint64_t>(
-      stream, (int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr,
-      num_blocks, params, shift_type, allocate_gpu_memory);
-}
-
-/// The arithmetic scalar shift is the one used for the signed right shift.
-/// It is constituted of a rotation, followed by
-/// the application of a PBS onto the rotated blocks up to num_blocks -
-/// rotations - 2 The last rotated block has another PBS applied, as it is the
-/// sign block, and a second PBS is also applied to it to compute the padding
-/// block, which is copied onto all remaining blocks instead of padding with
-/// zeros as would be done in the logical shift.
-void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
-    void *bsk, void *ksk, uint32_t num_blocks) {
-
-  host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array), shift,
-      (int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
-}
-
-void cleanup_cuda_integer_radix_logical_scalar_shift(cuda_stream_t *stream,
-                                                     int8_t **mem_ptr_void) {
-
-  cudaSetDevice(stream->gpu_index);
-  int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
-      (int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release(stream);
-}
-
-void cleanup_cuda_integer_radix_arithmetic_scalar_shift(cuda_stream_t *stream,
-                                                        int8_t **mem_ptr_void) {
-
-  cudaSetDevice(stream->gpu_index);
-  int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
-      (int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
+  int_shift_buffer<uint64_t> *mem_ptr =
+      (int_shift_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(stream);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -1,35 +1,31 @@
-#ifndef CUDA_INTEGER_SCALAR_SHIFT_OPS_CUH
-#define CUDA_INTEGER_SCALAR_SHIFT_OPS_CUH
+#ifndef CUDA_INTEGER_SHIFT_OPS_CUH
+#define CUDA_INTEGER_SHIFT_OPS_CUH

 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer.cuh"
 #include "integer.h"
-#include "pbs/programmable_bootstrap_classic.cuh"
-#include "pbs/programmable_bootstrap_multibit.cuh"
+#include "pbs/bootstrap_low_latency.cuh"
+#include "pbs/bootstrap_multibit.cuh"
 #include "types/complex/operations.cuh"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
-#include <omp.h>

 template <typename Torus>
-__host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
-    cuda_stream_t *stream, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
+__host__ void scratch_cuda_integer_radix_scalar_shift_kb(
+    cuda_stream_t *stream, int_shift_buffer<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
+    bool allocate_gpu_memory) {

-  cudaSetDevice(stream->gpu_index);
-  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
-      stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
+  *mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
+                                         num_radix_blocks, allocate_gpu_memory);
 }

 template <typename Torus>
-__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
+__host__ void host_integer_radix_scalar_shift_kb_inplace(
    cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
-    int_logical_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
-    uint32_t num_blocks) {
+    int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {

-  cudaSetDevice(stream->gpu_index);
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
@@ -48,10 +44,10 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
  size_t rotations = std::min(shift / num_bits_in_block, (size_t)num_blocks);
  size_t shift_within_block = shift % num_bits_in_block;

-  Torus *full_rotated_buffer = mem->tmp_rotated;
-  Torus *rotated_buffer = &full_rotated_buffer[big_lwe_size];
+  Torus *rotated_buffer = mem->tmp_rotated;

  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+  auto lut_univariate = mem->lut_buffers_univariate[shift_within_block];

  // rotate right all the blocks in radix ciphertext
  // copy result in new buffer
@@ -72,15 +68,23 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
      return;
    }

-    auto partial_current_blocks = &lwe_array[rotations * big_lwe_size];
-    auto partial_previous_blocks =
-        &full_rotated_buffer[rotations * big_lwe_size];
+    // check if we have enough blocks for partial processing
+    if (rotations < num_blocks - 1) {
+      auto partial_current_blocks = &lwe_array[(rotations + 1) * big_lwe_size];
+      auto partial_previous_blocks = &lwe_array[rotations * big_lwe_size];

-    size_t partial_block_count = num_blocks - rotations;
+      size_t partial_block_count = num_blocks - rotations - 1;

-    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, partial_current_blocks, partial_current_blocks,
-        partial_previous_blocks, bsk, ksk, partial_block_count, lut_bivariate);
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+          stream, partial_current_blocks, partial_current_blocks,
+          partial_previous_blocks, bsk, ksk, partial_block_count,
+          lut_bivariate);
+    }
+
+    auto rest = &lwe_array[rotations * big_lwe_size];
+
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        stream, rest, rest, bsk, ksk, 1, lut_univariate);

  } else {
    // right shift
@@ -98,138 +102,23 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
      return;
    }

-    auto partial_current_blocks = lwe_array;
-    auto partial_next_blocks = &rotated_buffer[big_lwe_size];
+    // check if we have enough blocks for partial processing
+    if (rotations < num_blocks - 1) {
+      auto partial_current_blocks = lwe_array;
+      auto partial_next_blocks = &lwe_array[big_lwe_size];

-    size_t partial_block_count = num_blocks - rotations;
+      size_t partial_block_count = num_blocks - rotations - 1;

-    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, partial_current_blocks, partial_current_blocks,
-        partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
-  }
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
-    cuda_stream_t *stream, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
-
-  cudaSetDevice(stream->gpu_index);
-  *mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
-      stream, shift_type, params, num_radix_blocks, allocate_gpu_memory);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
-    int_arithmetic_scalar_shift_buffer<Torus> *mem, void *bsk, Torus *ksk,
-    uint32_t num_blocks) {
-
-  cudaSetDevice(stream->gpu_index);
-  auto params = mem->params;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-
-  size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  size_t num_bits_in_block = (size_t)log2(message_modulus);
-  size_t total_num_bits = num_bits_in_block * num_blocks;
-  shift = shift % total_num_bits;
-
-  if (shift == 0) {
-    return;
-  }
-  size_t rotations = std::min(shift / num_bits_in_block, (size_t)num_blocks);
-  size_t shift_within_block = shift % num_bits_in_block;
-
-  Torus *rotated_buffer = mem->tmp_rotated;
-  Torus *padding_block = &rotated_buffer[num_blocks * big_lwe_size];
-  Torus *last_block_copy = &padding_block[big_lwe_size];
-
-  auto lut_univariate_shift_last_block =
-      mem->lut_buffers_univariate[shift_within_block - 1];
-  auto lut_univariate_padding_block =
-      mem->lut_buffers_univariate[num_bits_in_block - 1];
-  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
-  if (mem->shift_type == RIGHT_SHIFT) {
-    radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
-        rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
-    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
-                                 num_blocks * big_lwe_size_bytes, stream);
-
-    if (num_bits_in_block == 1) {
-      // if there is only 1 bit in the msg part, it means shift_within block is
-      // 0 thus only rotations is required.
-
-      // We still need to pad with the value of the sign bit.
-      // And here since a block only has 1 bit of message
-      // we can optimize things by not doing the pbs to extract this sign bit
-
-      Torus *block_src =
-          rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size;
-      Torus *block_dest =
-          rotated_buffer + (num_blocks - rotations) * big_lwe_size;
-      for (uint i = 0; i < num_blocks; i++) {
-        cuda_memcpy_async_gpu_to_gpu(block_dest, block_src, big_lwe_size_bytes,
-                                     stream);
-        block_dest += big_lwe_size;
-      }
-      return;
-    }
-
-    // In the arithmetic shift case we have to pad with the value of the sign
-    // bit. This creates the need for a different shifting lut than in the
-    // logical shift case. We also need another PBS to create the padding block.
-    Torus *last_block = lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
-    cuda_memcpy_async_gpu_to_gpu(last_block_copy,
-                                 rotated_buffer + (num_blocks - rotations - 1) *
-                                                      big_lwe_size,
-                                 big_lwe_size_bytes, stream);
-    auto partial_current_blocks = lwe_array;
-    auto partial_next_blocks = &rotated_buffer[big_lwe_size];
-    size_t partial_block_count = num_blocks - rotations;
-    if (shift_within_block != 0 && rotations != num_blocks) {
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          stream, partial_current_blocks, partial_current_blocks,
          partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
    }
-    // Since our CPU threads will be working on different streams we shall
-    // assert the work in the main stream is completed
-    stream->synchronize();
-#pragma omp parallel sections
-    {
-      // All sections may be executed in parallel
-#pragma omp section
-      {
-        integer_radix_apply_univariate_lookup_table_kb(
-            mem->local_stream_1, padding_block, last_block_copy, bsk, ksk, 1,
-            lut_univariate_padding_block);
-        // Replace blocks 'pulled' from the left with the correct padding block
-        for (uint i = 0; i < rotations; i++) {
-          cuda_memcpy_async_gpu_to_gpu(
-              lwe_array + (num_blocks - rotations + i) * big_lwe_size,
-              padding_block, big_lwe_size_bytes, mem->local_stream_1);
-        }
-      }
-#pragma omp section
-      {
-        if (shift_within_block != 0 && rotations != num_blocks) {
-          integer_radix_apply_univariate_lookup_table_kb(
-              mem->local_stream_2, last_block, last_block_copy, bsk, ksk, 1,
-              lut_univariate_shift_last_block);
-        }
-      }
-    }
-    cuda_synchronize_stream(mem->local_stream_1);
-    cuda_synchronize_stream(mem->local_stream_2);

-  } else {
-    PANIC("Cuda error (scalar shift): left scalar shift is never of the "
-          "arithmetic type")
+    // The right-most block is done separately as it does not
+    // need to recuperate the shifted bits from its right neighbour.
+    auto last_block = &lwe_array[(num_blocks - rotations - 1) * big_lwe_size];
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        stream, last_block, last_block, bsk, ksk, 1, lut_univariate);
  }
 }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Yuxi Zhao	bddc35459d	GITBOOK-5: Update TOC	2024-03-05 14:32:28 +00:00
Yuxi Zhao	27c421b359	GITBOOK-4: V2 design details	2024-02-28 15:27:05 +00:00
Yuxi Zhao	2adeff44f3	GITBOOK-3: correct a typo	2024-02-28 14:54:38 +00:00
Yuxi Zhao	d0042aed54	GITBOOK-2: No subject	2024-02-28 14:23:50 +00:00
Yuxi Zhao	5eabdeab55	GITBOOK-1: Remove extra sentences	2024-02-28 14:11:06 +00:00
yuxizama	0152c212af	Update SUMMARY.md	2024-02-28 15:07:14 +01:00
yuxizama	9a2c4a3784	Rename what-is-tfhe-rs to what-is-tfhe-rs.md	2024-02-28 15:06:45 +01:00
yuxizama	c14aad5656	V2 resorting 6	2024-02-28 14:57:24 +01:00
yuxizama	702e0ef306	V2 resorting 5	2024-02-28 14:54:21 +01:00
yuxizama	515d2e009f	V2 resorting 4	2024-02-28 14:30:49 +01:00
yuxizama	711b5151dc	V2 resorting 3	2024-02-28 14:28:58 +01:00
yuxizama	ceaee2f910	V2 resorting 2	2024-02-28 14:28:18 +01:00
yuxizama	41015db7a1	V2 resorting 1	2024-02-28 14:27:56 +01:00
Yuxi Zhao	485b2a7693	GITBOOK-15: V2 change images and adjust wording	2024-02-28 13:13:02 +00:00
Yuxi Zhao	7d903d5f7a	GITBOOK-13: update v2	2024-02-27 16:01:36 +00:00
Yuxi Zhao	19ac6eb123	GITBOOK-1: New structure	2024-02-27 16:27:16 +01:00
tmontaigu	5b653864b7	chore(tfhe): bump version to 0.5.2	2024-02-23 10:21:47 +01:00
Arthur Meyre	a1d189b415	chore(ci): update macOS runner for cargo builds	2024-02-23 10:21:47 +01:00
sarah el kazdadi	c59434f183	chore(ci): update toolchain, fix clippy warnings	2024-02-23 10:21:47 +01:00
David Testé	83239e6afa	chore(bench): implement integer casting benchmarks	2024-02-23 10:21:47 +01:00
sarah el kazdadi	ef8cb0273f	fix(tfhe): update pulp and bytemuck to fix nightly breakage	2024-02-23 10:21:47 +01:00
tmontaigu	9b353bac2d	fix(integer): correct degree in small comparisons	2024-02-23 10:21:47 +01:00
tmontaigu	46d65f1f87	fix(capi): add missing function on FheBool - safe ser/de - classical ser/de - comparisons - scalar binary fn/comparisons - compact & compressed fhe bool encryption	2024-02-23 10:21:47 +01:00
tmontaigu	a63a2cb725	chore(hlapi): add tests for fhe_bool	2024-02-23 10:21:47 +01:00
tmontaigu	c45af05ec6	fix(integer): make encrypt_bool specify the degree encrypt_one_block does not leak information on the message. BooleanBlocks are meant for when we want to be explicit that the value is a boolean and are ok for this to be public. Thus it needs to correctly set the degree to 1 for other operations to properly take advantage of that	2024-02-23 10:21:47 +01:00
tmontaigu	584eaeb4ed	fix(shortint): fix bitwise opts degree We used `after_bitand/or/xor` on the ct_left after the lut had changed its degree. So the `after_bit` function computed the resulting using a wrong degree for the left ct.	2024-02-23 10:21:47 +01:00
tmontaigu	8d94ed2512	fix(hlapi): bind missing cuda bitnot	2024-02-23 10:21:47 +01:00
tmontaigu	b8d9dbe85b	refactor(hlapi): split long files of hlapi This splits the long base.rs files into multiple ones, to make it easier to navigate. There are no code changes apart from moving stuff.	2024-02-23 10:21:47 +01:00
tmontaigu	ad25340c33	feat(capi): add Cuda support - This adds GPU support in the C API - Also make ctest (cmake test launcher) print test output when it fails	2024-02-23 10:21:47 +01:00
Arthur Meyre	ad1ae0c8c2	chore(ci): update scripts and Makefile for future forward compatibility	2024-01-31 18:22:15 +01:00
Arthur Meyre	ee40906b8b	chore(ci): convert some make targets to be semver trick compatible	2024-01-31 18:22:15 +01:00
Arthur Meyre	bf6b4cc541	chore(tfhe): bump version to 0.5.1	2024-01-30 10:51:39 +01:00
Arthur Meyre	24404567a4	chore(tfhe): bump tfhe-cuda-backend version to 0.1.3	2024-01-30 10:51:39 +01:00
tmontaigu	052dd4a60e	feat(integer): fuse two PBS in comparisons In comparisons, we were reducing a vec of orderings (inferior, equal, superior) into one final ordering, and then we would do one final PBS to transform that into a boolean value (0 or 1) depending on what was wanted (<=, <, >, >=). This fuses the last PBS (ordering -> boolean value) with the last round of reduction, when there are only two blocks left to be reduced. This allows to gain one PBS. Meaning for ciphertext/ciphertext comparisons we get back the performance loss introduced by the fix in f4c220c1. And comparisons between a clear and ciphertext get an improvement.	2024-01-30 10:51:39 +01:00
tmontaigu	f8d829d076	fix(integer): add noise cleaning pbs in comparisons In comparisons we were packing blocks to then do a subtraction between them. However this goes above the noise limit that would guarantee the advertised error probability. To fix that we add a pbs to clean the noise. This pbs only needs to be added in the ciphertext/ciphertext comparisons. Making them slower by 1 PBS.	2024-01-30 10:51:39 +01:00
dependabot[bot]	d9761ca17e	chore(deps): bump codecov/codecov-action from 3.1.4 to 3.1.5 Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3.1.4 to 3.1.5. - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](`eaaf4bedf3...4fe8c5f003`) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2024-01-30 10:51:39 +01:00
dependabot[bot]	8d2e15347b	chore(deps): bump tj-actions/changed-files from 42.0.0 to 42.0.2 Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.0 to 42.0.2. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](`ae82ed4ae0...90a06d6ba9`) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2024-01-30 10:51:39 +01:00
dependabot[bot]	a368257bc7	chore(deps): bump actions/upload-artifact from 4.1.0 to 4.3.0 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.1.0 to 4.3.0. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v4.1.0...26f96dfa697d77e81fd5907df203aa23a56210a8) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2024-01-30 10:51:39 +01:00
David Testé	76d23d0c91	chore(bench): add ciphertexts sum to integer benchmarks	2024-01-30 10:51:39 +01:00
David Testé	ddc5002232	chore(bench): add pbs benchmarks on gpu	2024-01-30 10:51:39 +01:00
tmontaigu	c08c479616	docs(hlapi): document trivial encryption to debug	2024-01-30 10:51:39 +01:00
tmontaigu	f26afc16de	docs(hlapi): document how to use rayon	2024-01-30 10:51:39 +01:00
yuxizama	13f533f6fb	chore(docs): update readme links and badges	2024-01-30 10:51:39 +01:00
yuxizama	d9541e472b	chore(docs): update README.md Change support banner	2024-01-30 10:51:39 +01:00
Agnes Leroy	3453e45258	fix(gpu): make all async functions unsafe, fix cuda_drop binding, add missing sync	2024-01-30 10:51:39 +01:00
David Testé	55de96f046	chore(ci): add gpu tests from user documentation	2024-01-30 10:51:39 +01:00
Agnes Leroy	9747c06f6e	chore(gpu): fix formatting command	2024-01-30 10:51:39 +01:00
Agnes Leroy	00f72d2c13	chore(gpu): fix compilation when no nvidia gpu is available	2024-01-30 10:51:39 +01:00
tmontaigu	01f5cb9056	fix(integer): is_scalar_out_of_bounds handles bigger ct Fix a bug where in is_scalar_out_of_bounds, if the scalar was negative and the ciphertext a signed one with more blocks than the decomposed scalar, we would do an out of bound access (i.e a panic). This fixes that, this will fix doing signed_overflowing_mul on 256 bits where the bug first appeared	2024-01-30 10:51:39 +01:00
David Testé	d66e313fa4	chore(ci): fix inputs for gpu full benchmark workflow	2024-01-30 10:51:39 +01:00
Arthur Meyre	c9d530e642	fix(core): ignore value in the body when doing LWE encryption	2024-01-30 10:51:39 +01:00
Agnes Leroy	6c2096fe52	chore(gpu): rename "test vector" -> "luts" and "tvi" -> "lut_indexes"	2024-01-30 10:51:39 +01:00
Agnes Leroy	1e94134dda	chore(gpu): move around code in integer.h for better readability	2024-01-30 10:51:39 +01:00
tmontaigu	c76a60111c	fix(integer): fix cast in scalar_shift/rotate In scalar_shift/rotate, we get the number of bits to shift/rotate as a generic type, that can be cast to u64. We compute the total number of bits the ciphertext has, cast that number to the same type as the scalar, and do "shift % num_bits". However, if the number of bits computed exceeds the max value the scalar type can hold, we could end up doing a remainder with 0. e.g 256bits ciphertext and scalar type u8 => 256u64 cast to u8 results in 0. Fix that by casting the scalar value to u64.	2024-01-30 10:51:39 +01:00
tmontaigu	18ff400df2	chore(hlapi): remove leftover file This file was not correctly removed during the refactor	2024-01-30 10:51:39 +01:00
David Testé	3d31d09be5	chore(ci): change rust-toolchain action GitHub third-party Action actions-rs/toolchain is not maintained anymore. We switch to dtolnay/rust-toolchain.	2024-01-30 10:51:39 +01:00
David Testé	76322606f2	chore(ci): set rustbacktrace var to full to ease debug on failure	2024-01-30 10:51:39 +01:00
dependabot[bot]	bf58a9f0c6	chore(deps): bump actions/upload-artifact from 3.1.2 to 4.2.0 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 3.1.2 to 4.2.0. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v3.1.2...694cdabd8bdb0f10b2cea11669e1bf5453eed0a6) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2024-01-30 10:51:39 +01:00
dependabot[bot]	64461c82b4	chore(deps): bump tj-actions/changed-files from 41.1.1 to 42.0.0 Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 41.1.1 to 42.0.0. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](`62f4729b5d...ae82ed4ae0`) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2024-01-30 10:51:39 +01:00
dependabot[bot]	339c84fbd9	chore(deps): bump actions/checkout from 3.5.3 to 4.1.1 Bumps [actions/checkout](https://github.com/actions/checkout) from 3.5.3 to 4.1.1. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3.5.3...b4ffde65f46336ab88eb53be808477a3936bae11) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com>	2024-01-30 10:51:39 +01:00
Arthur Meyre	bc682a5ffb	docs(bench): add scalar benchmarks for integer	2024-01-29 16:42:32 +01:00
Arthur Meyre	2920daf2d9	chore(docs): fix link to 0.4 semver doc	2024-01-23 10:50:25 +01:00