add assert

2026-04-28 03:01:21 -04:00 · 2023-10-11 17:30:04 +02:00
669 changed files with 24093 additions and 99189 deletions
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,6 +1,6 @@
 ---
 name: Bug report
-about: Report a problem with TFHE-rs
+about: Report a problem with concrete
 title: ''
 labels: triage_required
 assignees: ''
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -1,6 +1,6 @@
 ---
 name: Feature request
-about: Suggest an idea for TFHE-rs
+about: Suggest an idea for concrete
 title: ''
 labels: feature_request
 assignees: ''
--- a/.github/workflows/approve_label.yml
+++ b/.github/workflows/approve_label.yml
@@ -1,34 +0,0 @@
-# Manage approved label in pull request
-name: PR approved label manager
-
-on:
-  pull_request:
-  pull_request_review:
-    types: [submitted]
-
-jobs:
-  trigger-tests:
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: write
-    steps:
-      - name: Get current labels
-        uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
-
-      # Remove label if a push is performed after an approval
-      - name: Remove approved label
-        if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
-        uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
-        with:
-          # We use a PAT to have the same user (zama-bot) for label deletion as for creation.
-          github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
-          labels: approved
-
-      # Add label only if the review is approved and if the label doesn't already exist
-      - name: Add approved label
-        uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
-        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
-        with:
-          # We need to use a PAT to be able to trigger `labeled` event for the other workflow.
-          github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
-          labels: approved
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -5,54 +5,66 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (fast-tests)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-big
-
  fast-tests:
-    name: Fast CPU tests
-    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ inputs.runner_name }}
    steps:
+      # Step used for logging purposes.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: stable
+          default: true

      - name: Run concrete-csprng tests
        run: |
@@ -108,29 +120,8 @@ jobs:
        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (fast-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, fast-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-big
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (fast-tests) failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/aws_tfhe_gpu_4090_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_4090_tests.yml
@@ -1,70 +0,0 @@
-# Compile and test tfhe-cuda-backend on an RTX 4090 machine
-name: TFHE Cuda Backend - 4090 full tests
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-  pull_request:
-    types: [labeled]
-
-jobs:
-  cuda-tests-linux:
-    name: CUDA tests (RTX 4090)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, '4090_test') }}
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: ["self-hosted", "4090-desktop"]
-
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: stable
-
-      - name: Run fmt checks
-        run: |
-          make check_fmt_gpu
-
-      - name: Run clippy checks
-        run: |
-          make clippy_gpu
-
-      - name: Run all tests
-        run: |
-          make test_gpu
-
-      - name: Run user docs tests
-        run: |
-          make test_user_doc_gpu
-
-      - name: Test C API
-        run: |
-          make test_c_api_gpu
-
-      - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
-        if: ${{ github.event_name == 'pull_request' }}
-        with:
-          labels: 4090_test
-          github_token: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_gpu_tests.yml
+++ b/.github/workflows/aws_tfhe_gpu_tests.yml
@@ -1,137 +0,0 @@
-# Compile and test tfhe-cuda-backend on an AWS instance
-name: TFHE Cuda Backend - Full tests
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-  pull_request:
-
-jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cuda-tests)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: gpu-test
-
-  cuda-tests-linux:
-    name: CUDA tests
-    needs: setup-ec2
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 9
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-
-      - name: Set up home
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: stable
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Run fmt checks
-        run: |
-          make check_fmt_gpu
-
-      - name: Run clippy checks
-        run: |
-          make clippy_gpu
-
-      - name: Run all tests
-        run: |
-          make test_gpu
-
-      - name: Run user docs tests
-        run: |
-          make test_user_doc_gpu
-
-      - name: Test C API
-        run: |
-          make test_c_api_gpu
-
-
-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (cuda-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, cuda-tests-linux ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: gpu-test
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cuda-tests) failed. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -1,75 +1,77 @@
-name: AWS Unsigned Integer Tests on CPU
+name: AWS Integer Tests on CPU

 env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (unsigned-integer-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-big
-
-  unsigned-integer-tests:
-    name: Unsigned integer tests
-    needs: setup-ec2
+  integer-tests:
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ inputs.runner_name }}
    steps:
+      # Step used for logging purposes.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: stable
-
-      - name: Gen Keys if required
-        run: |
-          make GEN_KEY_CACHE_MULTI_BIT_ONLY=TRUE gen_key_cache
-
-      - name: Run unsigned integer multi-bit tests
-        run: |
-          AVX512_SUPPORT=ON make test_unsigned_integer_multi_bit_ci
+          default: true

      - name: Gen Keys if required
        run: |
          make gen_key_cache

-      - name: Run unsigned integer tests
+      - name: Run integer tests
        run: |
-          AVX512_SUPPORT=ON BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci
+          BIG_TESTS_INSTANCE=TRUE make test_integer_ci

      - name: Slack Notification
        if: ${{ always() }}
@@ -77,29 +79,8 @@ jobs:
        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (unsigned-integer-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, unsigned-integer-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-big
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (unsigned-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/aws_tfhe_multi_bit_tests.yml
+++ b/.github/workflows/aws_tfhe_multi_bit_tests.yml
@@ -0,0 +1,90 @@
+name: AWS Multi Bit Tests on CPU
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string
+
+jobs:
+  multi-bit-tests:
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
+      cancel-in-progress: true
+    runs-on: ${{ inputs.runner_name }}
+    steps:
+      # Step used for logging purposes.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        with:
+          toolchain: stable
+          default: true
+
+      - name: Gen Keys if required
+        run: |
+          make GEN_KEY_CACHE_MULTI_BIT_ONLY=TRUE gen_key_cache
+
+      - name: Run shortint multi-bit tests
+        run: |
+          make test_shortint_multi_bit_ci
+
+      - name: Run integer multi-bit tests
+        run: |
+          make test_integer_multi_bit_ci
+
+      - name: Slack Notification
+        if: ${{ always() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -1,109 +0,0 @@
-name: AWS Signed Integer Tests on CPU
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
-
-jobs:
-  setup-ec2:
-    name: Setup EC2 instance (signed-integer-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-big
-
-  signed-integer-tests:
-    name: Signed integer tests
-    needs: setup-ec2
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-
-      - name: Set up home
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: stable
-
-      - name: Gen Keys if required
-        run: |
-          make GEN_KEY_CACHE_MULTI_BIT_ONLY=TRUE gen_key_cache
-
-      - name: Run shortint multi-bit tests
-        run: |
-          make test_shortint_multi_bit_ci
-
-      - name: Run signed integer multi-bit tests
-        run: |
-          AVX512_SUPPORT=ON make test_signed_integer_multi_bit_ci
-
-      - name: Gen Keys if required
-        run: |
-          make gen_key_cache
-
-      - name: Run signed integer tests
-        run: |
-          AVX512_SUPPORT=ON BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci
-
-      - name: Slack Notification
-        if: ${{ always() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (signed-integer-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, signed-integer-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-big
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (signed-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -4,56 +4,66 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (cpu-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-big
-
-  cpu-tests:
-    name: CPU tests
-    needs: setup-ec2
+  shortint-tests:
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ inputs.runner_name }}
    steps:
+      # Step used for logging purposes.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: stable
+          default: true

      - name: Run concrete-csprng tests
        run: |
@@ -90,12 +100,6 @@ jobs:
      - name: Run example tests
        run: |
          make test_examples
-          make dark_market
-
-      - name: Run apps tests
-        run: |
-          make test_trivium
-          make test_kreyvium

      - name: Slack Notification
        if: ${{ always() }}
@@ -103,29 +107,8 @@ jobs:
        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (cpu-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, cpu-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-big
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (cpu-tests) failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -4,56 +4,66 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string

 jobs:
-  setup-ec2:
-    name: Setup EC2 instance (wasm-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-small
-
  wasm-tests:
-    name: WASM tests
-    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
+    runs-on: ${{ inputs.runner_name }}
    steps:
+      # Step used for logging purposes.
+      - name: Instance configuration used
+        run: |
+          echo "ID: ${{ inputs.instance_id }}"
+          echo "AMI: ${{ inputs.instance_image_id }}"
+          echo "Type: ${{ inputs.instance_type }}"
+          echo "Request ID: ${{ inputs.request_id }}"
+          echo "Fork repo: ${{ inputs.fork_repo }}"
+          echo "Fork git sha: ${{ inputs.fork_git_sha }}"
+
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}

      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: stable
+          default: true

      - name: Run js on wasm API tests
        run: |
@@ -70,29 +80,8 @@ jobs:
        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
        env:
          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (wasm-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, wasm-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-small
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (wasm-tests) failed. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/boolean_benchmark.yml
+++ b/.github/workflows/boolean_benchmark.yml
@@ -32,7 +32,6 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"

 jobs:
  run-boolean-benchmarks:
@@ -52,7 +51,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

@@ -62,13 +61,14 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
+          override: true

      - name: Run benchmarks with AVX512
        run: |
-          make bench_boolean
+          make AVX512_SUPPORT=ON bench_boolean

      - name: Parse results
        run: |
@@ -96,13 +96,13 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -6,7 +6,6 @@ on:
 env:
  CARGO_TERM_COLOR: always
  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
@@ -18,11 +17,11 @@ jobs:

    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest-large, windows-latest]
+        os: [ubuntu-latest, macos-latest, windows-latest]
      fail-fast: false

    steps:
-      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+      - uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608

      - name: Install and run newline linter checks
        if: matrix.os == 'ubuntu-latest'
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -4,7 +4,6 @@ env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -39,7 +38,6 @@ jobs:
      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
      cancel-in-progress: true
    runs-on: ${{ inputs.runner_name }}
-    timeout-minutes: 1080
    steps:
      # Step used for log purpose.
      - name: Instance configuration used
@@ -52,7 +50,7 @@ jobs:
          echo "Fork git sha: ${{ inputs.fork_git_sha }}"

      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: ${{ inputs.fork_repo }}
          ref: ${{ inputs.fork_git_sha }}
@@ -62,13 +60,14 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: stable
+          default: true

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@ec75ae5ab7296b81fd4cddb77294d6718932ebab
+        uses: tj-actions/changed-files@db153baf731265ad02cd490b07f470e2d55e3345
        with:
          files_yaml: |
            tfhe:
@@ -80,12 +79,6 @@ jobs:
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        run: |
          make GEN_KEY_CACHE_COVERAGE_ONLY=TRUE gen_key_cache
-          make gen_key_cache_core_crypto
-
-      - name: Run coverage for core_crypto
-        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
-        run: |
-          make test_core_crypto_cov AVX512_SUPPORT=ON

      - name: Run coverage for boolean
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
@@ -98,13 +91,13 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@e0b68c6749509c5f83f984dd99a76a1c1a231044
+        uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          directory: ./coverage/
          fail_ci_if_error: true
-          files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml
+          files: shortint/cobertura.xml,boolean/cobertura.xml

      - name: Slack Notification
        if: ${{ failure() }}
--- a/.github/workflows/core_crypto_gpu_benchmark.yml
+++ b/.github/workflows/core_crypto_gpu_benchmark.yml
@@ -1,153 +0,0 @@
-# Run core crypto benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
-name: Core crypto GPU benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      # This input is not used in this workflow but still mandatory since a calling workflow could
-      # use it. If a triggering command include a user_inputs field, then the triggered workflow
-      # must include this very input, otherwise the workflow won't be called.
-      # See start_full_benchmarks.yml as example.
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-
-jobs:
-  run-core-crypto-benchmarks:
-    name: Execute GPU core crypto benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 9
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make bench_pbs_gpu
-          make bench_ks_gpu
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --name-suffix avx512 \
-          --walk-subdirs \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_core_crypto
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "PBS GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/csprng_randomness_testing.yml
+++ b/.github/workflows/csprng_randomness_testing.yml
@@ -0,0 +1,74 @@
+name: CSPRNG randomness testing Workflow
+
+env:
+  CARGO_TERM_COLOR: always
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUSTFLAGS: "-C target-cpu=native"
+
+on:
+  # Allows you to run this workflow manually from the Actions tab as an alternative.
+  workflow_dispatch:
+    # All the inputs are provided by Slab
+    inputs:
+      instance_id:
+        description: "AWS instance ID"
+        type: string
+      instance_image_id:
+        description: "AWS instance AMI ID"
+        type: string
+      instance_type:
+        description: "AWS instance product type"
+        type: string
+      runner_name:
+        description: "Action runner name"
+        type: string
+      request_id:
+        description: 'Slab request ID'
+        type: string
+      fork_repo:
+        description: 'Name of forked repo as user/repo'
+        type: string
+      fork_git_sha:
+        description: 'Git SHA to checkout from fork'
+        type: string
+
+jobs:
+  csprng-randomness-testing:
+    name: CSPRNG randomness testing
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
+      cancel-in-progress: true
+    runs-on: ${{ inputs.runner_name }}
+
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
+        with:
+          repository: ${{ inputs.fork_repo }}
+          ref: ${{ inputs.fork_git_sha }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
+      - name: Install latest stable
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        with:
+          toolchain: stable
+          default: true
+
+      - name: Dieharder randomness test suite
+        run: |
+          make dieharder_csprng
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -1,94 +0,0 @@
-name: CSPRNG randomness testing Workflow
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
-
-
-jobs:
-  setup-ec2:
-    name: Setup EC2 instance (csprng-randomness-tests)
-    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-instance.outputs.label }}
-      instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
-    steps:
-      - name: Start instance
-        id: start-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-small
-
-  csprng-randomness-tests:
-    name: CSPRNG randomness tests
-    needs: setup-ec2
-    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
-      cancel-in-progress: true
-    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-
-      - name: Set up home
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: stable
-
-      - name: Dieharder randomness test suite
-        run: |
-          make dieharder_csprng
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-
-  teardown-ec2:
-    name: Teardown EC2 instance (csprng-randomness-tests)
-    if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
-    needs: [ setup-ec2, csprng-randomness-tests ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop instance
-        id: stop-instance
-        uses: zama-ai/slab-github-runner@ab65ad70bb9f9e9251e4915ea5612bcad23cd9b1
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          profile: cpu-small
-          label: ${{ needs.setup-ec2.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "EC2 teardown (csprng-randomness-tests) failed. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_benchmark.yml
+++ b/.github/workflows/integer_benchmark.yml
@@ -25,7 +25,6 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"

 jobs:
  run-integer-benchmarks:
@@ -45,7 +44,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

@@ -55,13 +54,14 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
+          override: true

      - name: Run benchmarks with AVX512
        run: |
-          make FAST_BENCH=TRUE bench_integer
+          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer

      - name: Parse benchmarks to csv
        run: |
@@ -69,7 +69,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,13 +90,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/integer_full_benchmark.yml
+++ b/.github/workflows/integer_full_benchmark.yml
@@ -28,7 +28,6 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"

 jobs:
  prepare-matrix:
@@ -40,12 +39,15 @@ jobs:
      - name: Weekly benchmarks
        if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
        run: |
-          echo "OP_FLAVOR=[\"default\"]" >> ${GITHUB_ENV}
+          echo "OP_FLAVOR=[\"default\", \"default_comp\", \"default_scalar\", \"default_scalar_comp\"]" >> ${GITHUB_ENV}

      - name: Quarterly benchmarks
        if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
        run: |
-          echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> ${GITHUB_ENV}
+          echo "OP_FLAVOR=[\"default\", \"default_comp\", \"default_scalar\", \"default_scalar_comp\", \
+          \"smart\", \"smart_comp\", \"smart_scalar\", \"smart_parallelized\", \"smart_parallelized_comp\", \"smart_scalar_parallelized\", \"smart_scalar_parallelized_comp\", \
+          \"unchecked\", \"unchecked_comp\", \"unchecked_scalar\", \"unchecked_scalar_comp\", \
+          \"misc\"]" >> ${GITHUB_ENV}

      -  name: Set operation flavor output
         id: set_op_flavor
@@ -58,7 +60,6 @@ jobs:
    runs-on: ${{ github.event.inputs.runner_name }}
    if: ${{ !cancelled() }}
    continue-on-error: true
-    timeout-minutes: 1440  # 24 hours
    strategy:
      max-parallel: 1
      matrix:
@@ -73,7 +74,7 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

@@ -89,12 +90,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
+          override: true

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
@@ -102,7 +104,7 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
+          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}

      - name: Parse results
        run: |
@@ -118,7 +120,7 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/integer_gpu_benchmark.yml
+++ b/.github/workflows/integer_gpu_benchmark.yml
@@ -1,157 +0,0 @@
-# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
-name: Integer GPU benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-
-jobs:
-  run-integer-benchmarks:
-    name: Execute integer benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 9
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_gpu_full_benchmark.yml
+++ b/.github/workflows/integer_gpu_full_benchmark.yml
@@ -1,162 +0,0 @@
-# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
-name: Integer GPU full benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      # This input is not used in this workflow but still mandatory since a calling workflow could
-      # use it. If a triggering command include a user_inputs field, then the triggered workflow
-      # must include this very input, otherwise the workflow won't be called.
-      # See start_full_benchmarks.yml as example.
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-
-jobs:
-  integer-benchmarks:
-    name: Execute integer benchmarks for all operations flavor
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    continue-on-error: true
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        command: [ integer, integer_multi_bit]
-        op_flavor: [ default, unchecked ]
-        # explicit include-based build matrix, of known valid options
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.2"
-            gcc: 9
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          fetch-depth: 0
-
-      - name: Get benchmark details
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
-          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-  slack-notification:
-    name: Slack Notification
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ failure() }}
-    needs: integer-benchmarks
-    steps:
-      - name: Notify
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer GPU full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/integer_multi_bit_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_benchmark.yml
@@ -25,7 +25,6 @@ env:
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"

 jobs:
  run-integer-benchmarks:
@@ -45,7 +44,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

@@ -55,13 +54,14 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
+          override: true

      - name: Run multi-bit benchmarks with AVX512
        run: |
-          make FAST_BENCH=TRUE bench_integer_multi_bit
+          make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer_multi_bit

      - name: Parse benchmarks to csv
        run: |
@@ -69,7 +69,7 @@ jobs:
            parse_integer_benches

      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: ${{ github.sha }}_csv_integer
          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,13 +90,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: ${{ github.sha }}_integer
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/integer_multi_bit_gpu_benchmark.yml
+++ b/.github/workflows/integer_multi_bit_gpu_benchmark.yml
@@ -1,158 +0,0 @@
-# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
-name: Integer Multi-bit benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-
-jobs:
-  run-integer-benchmarks:
-    name: Execute integer multi-bit benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "11.8"
-            cuda_arch: "70"
-            gcc: 9
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: nightly
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Run multi-bit benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --backend gpu \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -14,8 +14,8 @@ on:
 env:
  CARGO_TERM_COLOR: always
  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  CARGO_PROFILE: release_lto_off
  FAST_TESTS: "TRUE"

 concurrency:
@@ -28,12 +28,13 @@ jobs:
    runs-on: ["self-hosted", "m1mac"]

    steps:
-      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+      - uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: stable
+          default: true

      - name: Run pcc checks
        run: |
@@ -110,9 +111,10 @@ jobs:
        run: |
          make test_shortint_multi_bit_ci

-      - name: Run integer multi bit tests
-        run: |
-          make test_integer_multi_bit_ci
+      # # These multi bit integer tests are too slow on M1 with low core count and low RAM
+      # - name: Run integer multi bit tests
+      #   run: |
+      #     make test_integer_multi_bit_ci

  remove_label:
    name: Remove m1_test label
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -30,7 +30,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

@@ -49,7 +49,7 @@ jobs:

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
-        uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
+        uses: JS-DevTools/npm-publish@fe72237be0920f7a0cafd6a966c9b929c9466e9b
        with:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
@@ -65,7 +65,7 @@ jobs:

      - name: Publish Node package
        if: ${{ inputs.push_node_package }}
-        uses: JS-DevTools/npm-publish@4b07b26a2f6e0a51846e1870223e545bae91c552
+        uses: JS-DevTools/npm-publish@fe72237be0920f7a0cafd6a966c9b929c9466e9b
        with:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
--- a/.github/workflows/make_release_concrete_csprng.yml
+++ b/.github/workflows/make_release_concrete_csprng.yml
@@ -18,7 +18,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -17,10 +17,10 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608

      - name: Checkout lattice-estimator
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: malb/lattice-estimator
          path: lattice_estimator
--- a/.github/workflows/core_crypto_benchmark.yml
+++ b/.github/workflows/core_crypto_benchmark.yml
@@ -1,5 +1,5 @@
-# Run core crypto benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: Core crypto benchmarks
+# Run PBS benchmarks on an AWS instance and return parsed results to Slab CI bot.
+name: PBS benchmarks

 on:
  workflow_dispatch:
@@ -32,11 +32,10 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"

 jobs:
-  run-core-crypto-benchmarks:
-    name: Execute core crypto benchmarks in EC2
+  run-pbs-benchmarks:
+    name: Execute PBS benchmarks in EC2
    runs-on: ${{ github.event.inputs.runner_name }}
    if: ${{ !cancelled() }}
    steps:
@@ -52,7 +51,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

@@ -62,14 +61,14 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
+          override: true

      - name: Run benchmarks with AVX512
        run: |
-          make bench_pbs
-          make bench_ks
+          make AVX512_SUPPORT=ON bench_pbs

      - name: Parse results
        run: |
@@ -87,13 +86,13 @@ jobs:
          --throughput

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
-          name: ${{ github.sha }}_core_crypto
+          name: ${{ github.sha }}_pbs
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/shortint_benchmark.yml
+++ b/.github/workflows/shortint_benchmark.yml
@@ -24,7 +24,6 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"

 jobs:
  run-shortint-benchmarks:
@@ -44,7 +43,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

@@ -54,13 +53,14 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
+          override: true

      - name: Run benchmarks with AVX512
        run: |
-          make bench_shortint
+          make AVX512_SUPPORT=ON bench_shortint

      - name: Parse results
        run: |
@@ -88,13 +88,13 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: ${{ github.sha }}_shortint
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/shortint_full_benchmark.yml
+++ b/.github/workflows/shortint_full_benchmark.yml
@@ -32,7 +32,6 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"

 jobs:
  shortint-benchmarks:
@@ -52,7 +51,7 @@ jobs:
          echo "Request ID: ${{ inputs.request_id }}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

@@ -68,12 +67,13 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
+          override: true

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
@@ -81,7 +81,7 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
+          make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint

      - name: Parse results
        run: |
@@ -112,7 +112,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/signed_integer_benchmark.yml
+++ b/.github/workflows/signed_integer_benchmark.yml
@@ -1,129 +0,0 @@
-# Run signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: Signed Integer benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-
-jobs:
-  run-integer-benchmarks:
-    name: Execute signed integer benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: nightly
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE bench_signed_integer
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_full_benchmark.yml
+++ b/.github/workflows/signed_integer_full_benchmark.yml
@@ -1,133 +0,0 @@
-# Run all signed integer benchmarks on an AWS instance and return parsed results to Slab CI bot.
-name: Signed Integer full benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-      user_inputs:
-        description: "Type of benchmarks to run"
-        type: string
-        default: "weekly_benchmarks"
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-
-jobs:
-  integer-benchmarks:
-    name: Execute signed integer benchmarks for all operations flavor
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    continue-on-error: true
-    timeout-minutes: 1440  # 24 hours
-    strategy:
-      max-parallel: 1
-      matrix:
-        command: [ integer, integer_multi_bit ]
-        op_flavor: [ default, unchecked ]
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          fetch-depth: 0
-
-      - name: Get benchmark details
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-          echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
-          echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: nightly
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
-
-      - name: Run benchmarks with AVX512
-        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
-
-      - name: Parse results
-        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-  slack-notification:
-    name: Slack Notification
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ failure() }}
-    needs: integer-benchmarks
-    steps:
-      - name: Notify
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/signed_integer_multi_bit_benchmark.yml
+++ b/.github/workflows/signed_integer_multi_bit_benchmark.yml
@@ -1,129 +0,0 @@
-# Run signed integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
-name: Signed Integer Multi-bit benchmarks
-
-on:
-  workflow_dispatch:
-    inputs:
-      instance_id:
-        description: "Instance ID"
-        type: string
-      instance_image_id:
-        description: "Instance AMI ID"
-        type: string
-      instance_type:
-        description: "Instance product type"
-        type: string
-      runner_name:
-        description: "Action runner name"
-        type: string
-      request_id:
-        description: "Slab request ID"
-        type: string
-
-env:
-  CARGO_TERM_COLOR: always
-  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
-  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"
-
-jobs:
-  run-integer-benchmarks:
-    name: Execute signed integer multi-bit benchmarks in EC2
-    runs-on: ${{ github.event.inputs.runner_name }}
-    if: ${{ !cancelled() }}
-    steps:
-      - name: Instance configuration used
-        run: |
-          echo "IDs: ${{ inputs.instance_id }}"
-          echo "AMI: ${{ inputs.instance_image_id }}"
-          echo "Type: ${{ inputs.instance_type }}"
-          echo "Request ID: ${{ inputs.request_id }}"
-
-      - name: Get benchmark date
-        run: |
-          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
-
-      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          fetch-depth: 0
-
-      - name: Set up home
-        # "Install rust" step require root user to have a HOME directory which is not set.
-        run: |
-          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
-
-      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
-        with:
-          toolchain: nightly
-
-      - name: Run multi-bit benchmarks with AVX512
-        run: |
-          make FAST_BENCH=TRUE bench_signed_integer_multi_bit
-
-      - name: Parse benchmarks to csv
-        run: |
-          make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
-            parse_integer_benches
-
-      - name: Upload csv results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_csv_integer
-          path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
-
-      - name: Parse results
-        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
-          --database tfhe_rs \
-          --hardware ${{ inputs.instance_type }} \
-          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --walk-subdirs \
-          --name-suffix avx512 \
-          --throughput
-
-      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
-        with:
-          name: ${{ github.sha }}_integer
-          path: ${{ env.RESULTS_FILENAME }}
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
-
-      - name: Send data to Slab
-        shell: bash
-        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "Signed integer benchmarks failed. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/start_benchmarks.yml
+++ b/.github/workflows/start_benchmarks.yml
@@ -20,24 +20,12 @@ on:
        description: "Run integer benches"
        type: boolean
        default: true
-      signed_integer_bench:
-        description: "Run signed integer benches"
-        type: boolean
-        default: true
      integer_multi_bit_bench:
        description: "Run integer multi bit benches"
        type: boolean
        default: true
-      signed_integer_multi_bit_bench:
-        description: "Run signed integer multi bit benches"
-        type: boolean
-        default: true
-      core_crypto_bench:
-        description: "Run core crypto benches"
-        type: boolean
-        default: true
-      core_crypto_gpu_bench:
-        description: "Run core crypto benches on GPU"
+      pbs_bench:
+        description: "Run PBS benches"
        type: boolean
        default: true
      wasm_client_bench:
@@ -50,21 +38,17 @@ jobs:
    if: ${{ (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
    strategy:
      matrix:
-        command: [ boolean_bench, shortint_bench,
-                   integer_bench, integer_multi_bit_bench,
-                   signed_integer_bench, signed_integer_multi_bit_bench,
-                   integer_gpu_bench, integer_multi_bit_gpu_bench,
-                   core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
+        command: [boolean_bench, shortint_bench, integer_bench, integer_multi_bit_bench, pbs_bench, wasm_client_bench]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@ec75ae5ab7296b81fd4cddb77294d6718932ebab
+        uses: tj-actions/changed-files@db153baf731265ad02cd490b07f470e2d55e3345
        with:
          files_yaml: |
            common_benches:
@@ -85,33 +69,23 @@ jobs:
            integer_bench:
              - tfhe/src/shortint/**
              - tfhe/src/integer/**
-              - tfhe/benches/integer/bench.rs
+              - tfhe/benches/integer/**
              - .github/workflows/integer_benchmark.yml
            integer_multi_bit_bench:
              - tfhe/src/shortint/**
              - tfhe/src/integer/**
-              - tfhe/benches/integer/bench.rs
-              - .github/workflows/integer_multi_bit_benchmark.yml
-            signed_integer_bench:
-              - tfhe/src/shortint/**
-              - tfhe/src/integer/**
-              - tfhe/benches/integer/signed_bench.rs
-              - .github/workflows/signed_integer_benchmark.yml
-            signed_integer_multi_bit_bench:
-              - tfhe/src/shortint/**
-              - tfhe/src/integer/**
-              - tfhe/benches/integer/signed_bench.rs
-              - .github/workflows/signed_integer_multi_bit_benchmark.yml
-            core_crypto_bench:
+              - tfhe/benches/integer/**
+              - .github/workflows/integer_benchmark.yml
+            pbs_bench:
              - tfhe/src/core_crypto/**
              - tfhe/benches/core_crypto/**
-              - .github/workflows/core_crypto_benchmark.yml
+              - .github/workflows/pbs_benchmark.yml
            wasm_client_bench:
              - tfhe/web_wasm_parallel_tests/**
              - .github/workflows/wasm_client_benchmark.yml

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/start_full_benchmarks.yml
+++ b/.github/workflows/start_full_benchmarks.yml
@@ -24,18 +24,16 @@ jobs:
    if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
    strategy:
      matrix:
-        command: [ boolean_bench, shortint_full_bench,
-                   integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
-                   core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
+        command: [ boolean_bench, shortint_full_bench, integer_full_bench, pbs_bench, wasm_client_bench ]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout tfhe-rs
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -13,11 +13,11 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0
      - name: Save repo
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: repo-archive
          path: '.'
--- a/.github/workflows/trigger_aws_tests_on_pr.yml
+++ b/.github/workflows/trigger_aws_tests_on_pr.yml
@@ -0,0 +1,54 @@
+# Trigger an AWS build each time commits are pushed to a pull request.
+name: PR AWS build trigger
+
+on:
+  pull_request:
+  pull_request_review:
+    types: [submitted]
+
+jobs:
+  trigger-tests:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Get current labels
+        uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
+
+      - name: Remove approved label
+        if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
+        uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          labels: approved
+
+      - name: Launch fast tests
+        if: ${{ github.event_name == 'pull_request' }}
+        uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
+        with:
+          allow-repeats: true
+          message: |
+            @slab-ci cpu_fast_test
+
+      - name: Add approved label
+        uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
+        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          labels: approved
+
+      # PR label 'approved' presence is checked to avoid running the full test suite several times
+      # in case of multiple approvals without new commits in between.
+      - name: Launch full tests suite
+        if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
+        uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
+        with:
+          allow-repeats: true
+          message: |
+            Pull Request has been approved :tada:
+            Launching full test suite...
+            @slab-ci cpu_test
+            @slab-ci cpu_integer_test
+            @slab-ci cpu_multi_bit_test
+            @slab-ci cpu_wasm_test
+            @slab-ci csprng_randomness_testing
--- a/.github/workflows/wasm_client_benchmark.yml
+++ b/.github/workflows/wasm_client_benchmark.yml
@@ -32,7 +32,6 @@ env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUST_BACKTRACE: "full"

 jobs:
  run-wasm-client-benchmarks:
@@ -52,7 +51,7 @@ jobs:
          echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"

      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          fetch-depth: 0

@@ -62,9 +61,10 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
        with:
          toolchain: nightly
+          override: true

      - name: Run benchmarks
        run: |
@@ -97,13 +97,13 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
        with:
          name: ${{ github.sha }}_wasm
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
+        uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608
        with:
          repository: zama-ai/slab
          path: slab
--- a/.gitignore
+++ b/.gitignore
@@ -3,9 +3,9 @@ target/
 .vscode/

 # Path we use for internal-keycache during tests
-/keys/
+./keys/
 # In case of symlinked keys
-/keys
+./keys

 **/Cargo.lock
 **/*.bin
@@ -18,4 +18,4 @@ target/
 dieharder_run.log

 # Coverage reports
-/coverage/
+./coverage/
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [workspace]
 resolver = "2"
-members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng", "backends/tfhe-cuda-backend"]
+members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng"]

 [profile.bench]
 lto = "fat"
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2024 ZAMA.
+Copyright © 2023 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/362
+++ b/362
@@ -6,7 +6,7 @@ TARGET_ARCH_FEATURE:=$(shell ./scripts/get_arch_feature.sh)
 RS_BUILD_TOOLCHAIN:=stable
 CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
 CARGO_PROFILE?=release
-MIN_RUST_VERSION:=$(shell grep '^rust-version[[:space:]]*=' tfhe/Cargo.toml | cut -d '=' -f 2 | xargs)
+MIN_RUST_VERSION:=$(shell grep rust-version tfhe/Cargo.toml | cut -d '=' -f 2 | xargs)
 AVX512_SUPPORT?=OFF
 WASM_RUSTFLAGS:=
 BIG_TESTS_INSTANCE?=FALSE
@@ -16,18 +16,6 @@ PARSE_INTEGER_BENCH_CSV_FILE?=tfhe_rs_integer_benches.csv
 FAST_TESTS?=FALSE
 FAST_BENCH?=FALSE
 BENCH_OP_FLAVOR?=DEFAULT
-NODE_VERSION=20
-FORWARD_COMPAT?=OFF
-# sed: -n, do not print input stream, -e means a script/expression
-# 1,/version/ indicates from the first line, to the line matching version at the start of the line
-# p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
-# entry which should be the version of tfhe
-TFHE_CURRENT_VERSION:=\
-$(shell sed -n -e '1,/^version/p' tfhe/Cargo.toml | \
-grep '^version[[:space:]]*=' | cut -d '=' -f 2 | xargs)
-# Cargo has a hard time distinguishing between our package from the workspace and a package that
-# could be a dependency, so we build an unambiguous spec here
-TFHE_SPEC:=tfhe@$(TFHE_CURRENT_VERSION)
 # This is done to avoid forgetting it, we still precise the RUSTFLAGS in the commands to be able to
 # copy paste the command in the terminal and change them if required without forgetting the flags
 export RUSTFLAGS?=-C target-cpu=native
@@ -50,20 +38,10 @@ else
 		COVERAGE_ONLY=
 endif

-ifeq ($(FORWARD_COMPAT),ON)
-		FORWARD_COMPAT_FEATURE=forward_compatibility
-else
-		FORWARD_COMPAT_FEATURE=
-endif
-
 # Variables used only for regex_engine example
 REGEX_STRING?=''
 REGEX_PATTERN?=''

-# tfhe-cuda-backend
-TFHECUDA_SRC="backends/tfhe-cuda-backend/cuda"
-TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
-
 # Exclude these files from coverage reports
 define COVERAGE_EXCLUDED_FILES
 --exclude-files apps/trivium/src/trivium/* \
@@ -121,7 +99,7 @@ install_wasm_pack: install_rs_build_toolchain
 install_node:
 	curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.3/install.sh | $(SHELL)
 	source ~/.bashrc
-	$(SHELL) -i -c 'nvm install $(NODE_VERSION)' || \
+	$(SHELL) -i -c 'nvm install node' || \
 	( echo "Unable to install node, unknown error." && exit 1 )

 .PHONY: install_dieharder # Install dieharder for apt distributions or macOS
@@ -142,32 +120,16 @@ install_tarpaulin: install_rs_build_toolchain
 .PHONY: check_linelint_installed # Check if linelint newline linter is installed
 check_linelint_installed:
 	@printf "\n" | linelint - > /dev/null 2>&1 || \
-	( echo "Unable to locate linelint. Try installing it: https://github.com/fernandrone/linelint/releases" && exit 1 )
+	( echo "Unable to locate linelint. Try installing it:  https://github.com/fernandrone/linelint/releases" && exit 1 )

 .PHONY: fmt # Format rust code
 fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt

-.PHONY: fmt_gpu # Format rust and cuda code
-fmt_gpu: install_rs_check_toolchain
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
-	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh
-
 .PHONY: check_fmt # Check rust code format
 check_fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check

-.PHONY: check_fmt_gpu # Check rust and cuda code format
-check_fmt_gpu: install_rs_check_toolchain
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
-	cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
-
-.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
-clippy_gpu: install_rs_check_toolchain clippy_cuda_backend
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
-
 .PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
 fix_newline: check_linelint_installed
 	linelint -a .
@@ -180,52 +142,46 @@ check_newline: check_linelint_installed
 clippy_core: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE) \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
+		-p tfhe -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),experimental \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),nightly-avx512 \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
+		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_boolean # Run clippy lints enabling the boolean features
 clippy_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),boolean \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
+		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_shortint # Run clippy lints enabling the shortint features
 clippy_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),shortint \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
+		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_integer # Run clippy lints enabling the integer features
 clippy_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),integer \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
+		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy # Run clippy lints enabling the boolean, shortint, integer
 clippy: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
+		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
 clippy_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
+		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
 clippy_js_wasm_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
 		--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
+		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
 clippy_tasks:
@@ -234,14 +190,15 @@ clippy_tasks:

 .PHONY: clippy_trivium # Run clippy lints on Trivium app
 clippy_trivium: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		-p tfhe-trivium -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy -p tfhe-trivium \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer \
+		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache \
-		-p $(TFHE_SPEC) -- --no-deps -D warnings
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,safe-deserialization \
+		-p tfhe -- --no-deps -D warnings

 .PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
 clippy_concrete_csprng:
@@ -257,75 +214,62 @@ clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_triviu
 clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
 clippy_concrete_csprng

-.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
-clippy_cuda_backend: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		-p tfhe-cuda-backend -- --no-deps -D warnings
+.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
+gen_key_cache: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
+		--example generates_test_keys \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -p tfhe -- \
+		$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)

 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE) -p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE) -p tfhe
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),$(AVX512_FEATURE) -p $(TFHE_SPEC); \
+			--features=$(TARGET_ARCH_FEATURE),$(AVX512_FEATURE) -p tfhe; \
 	fi

 .PHONY: build_core_experimental # Build core_crypto with experimental features
 build_core_experimental: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE),experimental -p tfhe
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC); \
+			--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p tfhe; \
 	fi

 .PHONY: build_boolean # Build with boolean enabled
 build_boolean: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean -p $(TFHE_SPEC) --all-targets
+		--features=$(TARGET_ARCH_FEATURE),boolean -p tfhe --all-targets

 .PHONY: build_shortint # Build with shortint enabled
 build_shortint: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),shortint -p $(TFHE_SPEC) --all-targets
+		--features=$(TARGET_ARCH_FEATURE),shortint -p tfhe --all-targets

 .PHONY: build_integer # Build with integer enabled
 build_integer: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) --all-targets
+		--features=$(TARGET_ARCH_FEATURE),integer -p tfhe --all-targets

 .PHONY: build_tfhe_full # Build with boolean, shortint and integer enabled
 build_tfhe_full: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
-
-.PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
-symlink_c_libs_without_fingerprint:
-	@./scripts/symlink_c_libs_without_fingerprint.sh \
-		--cargo-profile "$(CARGO_PROFILE)" \
-		--lib-name tfhe-c-api-dynamic-buffer
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p tfhe --all-targets

 .PHONY: build_c_api # Build the C API for boolean, shortint and integer
 build_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,$(FORWARD_COMPAT_FEATURE) \
-		-p $(TFHE_SPEC)
-	@"$(MAKE)" symlink_c_libs_without_fingerprint
-
-.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
-build_c_api_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,gpu \
-		-p $(TFHE_SPEC)
-	@"$(MAKE)" symlink_c_libs_without_fingerprint
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api, \
+		-p tfhe

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
 build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
-		-p $(TFHE_SPEC)
-	@"$(MAKE)" symlink_c_libs_without_fingerprint
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4 \
+		-p tfhe

 .PHONY: build_web_js_api # Build the js API targeting the web browser
 build_web_js_api: install_rs_build_toolchain install_wasm_pack
@@ -355,60 +299,19 @@ build_concrete_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng --all-targets

-#.PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
-#test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
-#	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-#		--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC) -- core_crypto::
-#	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
-#		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-#			--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
-#	fi
-
 .PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
 test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC) -- glwe_encrypt_tensor_prod_decrypt
+		--features=$(TARGET_ARCH_FEATURE),experimental -p tfhe -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p $(TFHE_SPEC) -- core_crypto::; \
+			--features=$(TARGET_ARCH_FEATURE),experimental,$(AVX512_FEATURE) -p tfhe -- core_crypto::; \
 	fi

-.PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
-test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain install_tarpaulin
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
-		--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
-		--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage \
-		-p $(TFHE_SPEC) -- core_crypto::
-	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
-		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
-			--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
-			--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage,$(AVX512_FEATURE) \
-			-p $(TFHE_SPEC) -- core_crypto::; \
-	fi
-
-.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_gpu: test_core_crypto_gpu test_integer_gpu
-
-.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
-test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
-
-.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
-test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
-
 .PHONY: test_boolean # Run the tests of the boolean module
 test_boolean: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean -p $(TFHE_SPEC) -- boolean::
+		--features=$(TARGET_ARCH_FEATURE),boolean -p tfhe -- boolean::

 .PHONY: test_boolean_cov # Run the tests of the boolean module with code coverage
 test_boolean_cov: install_rs_check_toolchain install_tarpaulin
@@ -416,13 +319,13 @@ test_boolean_cov: install_rs_check_toolchain install_tarpaulin
 		--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
 		--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,__coverage \
-		-p $(TFHE_SPEC) -- boolean::
+		-p tfhe -- boolean::

 .PHONY: test_c_api_rs # Run the rust tests for the C API
 test_c_api_rs: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
-		-p $(TFHE_SPEC) \
+		-p tfhe \
 		c_api

 .PHONY: test_c_api_c # Run the C tests for the C API
@@ -432,28 +335,24 @@ test_c_api_c: build_c_api
 .PHONY: test_c_api # Run all the tests for the C API
 test_c_api: test_c_api_rs test_c_api_c

-.PHONY: test_c_api_gpu # Run the C tests for the C API
-test_c_api_gpu: build_c_api_gpu
-	./scripts/c_api_tests.sh --gpu
-
 .PHONY: test_shortint_ci # Run the tests for shortint ci
 test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "$(TFHE_SPEC)"
+		--cargo-profile "$(CARGO_PROFILE)"

 .PHONY: test_shortint_multi_bit_ci # Run the tests for shortint ci running only multibit tests
 test_shortint_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
 		./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --tfhe-package "$(TFHE_SPEC)"
+		--cargo-profile "$(CARGO_PROFILE)" --multi-bit

 .PHONY: test_shortint # Run all the tests for shortint
 test_shortint: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p $(TFHE_SPEC) -- shortint::
+		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p tfhe -- shortint::

 .PHONY: test_shortint_cov # Run the tests of the shortint module with code coverage
 test_shortint_cov: install_rs_check_toolchain install_tarpaulin
@@ -461,90 +360,44 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
 		--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
 		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,__coverage \
-		-p $(TFHE_SPEC) -- shortint::
+		-p tfhe -- shortint::

 .PHONY: test_integer_ci # Run the tests for integer ci
-test_integer_ci: install_rs_check_toolchain install_cargo_nextest
+test_integer_ci: install_rs_build_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
-		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
-		--tfhe-package "$(TFHE_SPEC)"
-
-.PHONY: test_unsigned_integer_ci # Run the tests for unsigned integer ci
-test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
-	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
-	FAST_TESTS="$(FAST_TESTS)" \
-		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
-		--unsigned-only --tfhe-package "$(TFHE_SPEC)"
-
-.PHONY: test_signed_integer_ci # Run the tests for signed integer ci
-test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
-	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
-	FAST_TESTS="$(FAST_TESTS)" \
-		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
-		--signed-only --tfhe-package "$(TFHE_SPEC)"
+		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
+		--cargo-profile "$(CARGO_PROFILE)"

 .PHONY: test_integer_multi_bit_ci # Run the tests for integer ci running only multibit tests
-test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
+test_integer_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
 	FAST_TESTS="$(FAST_TESTS)" \
-		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
-		--tfhe-package "$(TFHE_SPEC)"
-
-.PHONY: test_unsigned_integer_multi_bit_ci # Run the tests for nsigned integer ci running only multibit tests
-test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
-	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
-	FAST_TESTS="$(FAST_TESTS)" \
-		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
-		--unsigned-only --tfhe-package "$(TFHE_SPEC)"
-
-.PHONY: test_signed_integer_multi_bit_ci # Run the tests for nsigned integer ci running only multibit tests
-test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
-	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
-	FAST_TESTS="$(FAST_TESTS)" \
-		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
-		--signed-only --tfhe-package "$(TFHE_SPEC)"
+		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
+		--cargo-profile "$(CARGO_PROFILE)" --multi-bit

 .PHONY: test_safe_deserialization # Run the tests for safe deserialization
 test_safe_deserialization: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_deserialization::
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,safe-deserialization -p tfhe -- safe_deserialization::

 .PHONY: test_integer # Run all the tests for integer
 test_integer: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::
+		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p tfhe -- integer::

 .PHONY: test_high_level_api # Run all the tests for high_level_api
 test_high_level_api: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p tfhe \
 		-- high_level_api::

 .PHONY: test_user_doc # Run tests from the .md documentation
 test_user_doc: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p tfhe \
 		-- test_user_docs::

-.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
-test_user_doc_gpu: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu -p $(TFHE_SPEC) \
-		-- test_user_docs::
-
-.PHONY: test_fhe_strings # Run tests for fhe_strings example
-test_fhe_strings: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--example fhe_strings \
-		--features=$(TARGET_ARCH_FEATURE),integer
-
 .PHONY: test_regex_engine # Run tests for regex_engine example
 test_regex_engine: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -579,7 +432,7 @@ test_concrete_csprng:
 doc: install_rs_check_toolchain
 	RUSTDOCFLAGS="--html-in-header katex-header.html" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps -p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps

 .PHONY: docs # Build rust doc alias for doc
 docs: doc
@@ -588,7 +441,7 @@ docs: doc
 lint_doc: install_rs_check_toolchain
 	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --no-deps
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps

 .PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
 lint_docs: lint_doc
@@ -605,18 +458,18 @@ format_doc_latex:
 .PHONY: check_compile_tests # Build tests in debug without running them
 check_compile_tests:
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
-		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
-		-p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,safe-deserialization \
+		-p tfhe

 	@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
-		"$(MAKE)" build_c_api && \
+		"$(MAKE)" build_c_api; \
 		./scripts/c_api_tests.sh --build-only; \
 	fi

 .PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
 build_nodejs_test_docker:
 	DOCKER_BUILDKIT=1 docker build --build-arg RUST_TOOLCHAIN="$(RS_BUILD_TOOLCHAIN)" \
-		-f docker/Dockerfile.wasm_tests --build-arg NODE_VERSION=$(NODE_VERSION) -t tfhe-wasm-tests .
+		-f docker/Dockerfile.wasm_tests -t tfhe-wasm-tests .

 .PHONY: test_nodejs_wasm_api_in_docker # Run tests for the nodejs on wasm API in a docker container
 test_nodejs_wasm_api_in_docker: build_nodejs_test_docker
@@ -640,8 +493,7 @@ test_web_js_api_parallel: build_web_js_api_parallel
 .PHONY: ci_test_web_js_api_parallel # Run tests for the web wasm api
 ci_test_web_js_api_parallel: build_web_js_api_parallel
 	source ~/.nvm/nvm.sh && \
-	nvm install $(NODE_VERSION) && \
-	nvm use $(NODE_VERSION) && \
+	nvm use node && \
 	$(MAKE) -C tfhe/web_wasm_parallel_tests test-ci

 .PHONY: no_tfhe_typo # Check we did not invert the h and f in tfhe
@@ -660,70 +512,27 @@ dieharder_csprng: install_dieharder build_concrete_csprng
 # Benchmarks
 #

-.PHONY: bench_integer # Run benchmarks for unsigned integer
+.PHONY: bench_integer # Run benchmarks for integer
 bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p tfhe --

-.PHONY: bench_signed_integer # Run benchmarks for signed integer
-bench_signed_integer: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-signed-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
-
-.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
-bench_integer_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
-
-.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
+.PHONY: bench_integer_multi_bit # Run benchmarks for integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
-
-.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
-bench_signed_integer_multi_bit: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
-	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-signed-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
-
-.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
-bench_integer_multi_bit_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
-	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p tfhe --

 .PHONY: bench_shortint # Run benchmarks for shortint
 bench_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_oprf # Run benchmarks for shortint
-bench_oprf: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench oprf-shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-	RUSTFLAGS="$(RUSTFLAGS)" \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench oprf-integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p tfhe

 .PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
 bench_shortint_multi_bit: install_rs_check_toolchain
@@ -731,38 +540,20 @@ bench_shortint_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p tfhe --


 .PHONY: bench_boolean # Run benchmarks for boolean
 bench_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench boolean-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,$(AVX512_FEATURE) -p tfhe

 .PHONY: bench_pbs # Run benchmarks for PBS
 bench_pbs: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
-bench_pbs_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench pbs-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_ks # Run benchmarks for keyswitch
-bench_ks: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench ks-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_ks_gpu # Run benchmarks for PBS on GPU backend
-bench_ks_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench ks-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,$(AVX512_FEATURE) -p tfhe

 .PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
 bench_web_js_api_parallel: build_web_js_api_parallel
@@ -777,18 +568,6 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
 #
 # Utility tools
 #
-.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
-gen_key_cache: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-		--example generates_test_keys \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -- \
-		$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
-
-.PHONY: gen_key_cache_core_crypto # Run function to generate keys and cache them for core_crypto tests
-gen_key_cache_core_crypto: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --tests --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache -p $(TFHE_SPEC) -- --nocapture \
-		core_crypto::keycache::generate_keys

 .PHONY: measure_hlapi_compact_pk_ct_sizes # Measure sizes of public keys and ciphertext for high-level API
 measure_hlapi_compact_pk_ct_sizes: install_rs_check_toolchain
@@ -851,12 +630,9 @@ sha256_bool: install_rs_check_toolchain
 	--example sha256_bool \
 	--features=$(TARGET_ARCH_FEATURE),boolean

-.PHONY: pcc # pcc stands for pre commit checks (except GPU)
+.PHONY: pcc # pcc stands for pre commit checks
 pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests

-.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
-pcc_gpu: pcc clippy_gpu
-
 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
 fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests

--- a/README.md
+++ b/README.md
@@ -2,66 +2,36 @@
 <!-- product name logo -->
  <img width=600 src="https://user-images.githubusercontent.com/5758427/231206749-8f146b97-3c5a-4201-8388-3ffa88580415.png">
 </p>
-
+<hr/>
+<p align="center">
+  <a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a>
+</p>
+<p align="center">
+<!-- Version badge using shields.io -->
+  <a href="https://github.com/zama-ai/tfhe-rs/releases">
+    <img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square">
+  </a>
+<!-- Zama Bounty Program -->
+  <a href="https://github.com/zama-ai/bounty-program">
+    <img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-yellow?style=flat-square">
+  </a>
+</p>
 <hr/>

-<p align="center">
-  <a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
-</p>

+**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer
+arithmetic over encrypted data. It includes:
+ - a **Rust** API
+ - a **C** API
+ - and a **client-side WASM** API

-<p align="center">
-  <a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
-  <a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
-  <a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
-</p>
-
-## About
-
-### What is TFHE-rs
-
-**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer arithmetics over encrypted data.
-
-It includes:
- a **Rust** API
- a **C** API
- and a **client-side WASM** API
-
-TFHE-rs is designed for developers and researchers who want full control over
-what they can do with TFHE, while not having to worry about the low-level
+**TFHE-rs** is meant for developers and researchers who want full control over
+what they can do with TFHE, while not having to worry about the low-level
 implementation. The goal is to have a stable, simple, high-performance, and
 production-ready library for all the advanced features of TFHE.
-<br></br>
-
-### Main features
-
- **Low-level cryptographic library** that implements Zama’s variant of TFHE, including programmable bootstrapping
- **Implementation of the original TFHE boolean API** that can be used as a drop-in replacement for other TFHE libraries
- **Short integer API** that enables exact, unbounded FHE integer arithmetics with up to 8 bits of message space
- **Size-efficient public key encryption**
- **Ciphertext and server key compression** for efficient data transfer
- **Full Rust API, C bindings to the Rust High-Level API, and client-side Javascript API using WASM**.
-
-*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
-<br></br>
-
-## Table of Contents
- **[Getting Started](#getting-started)**
-   - [Cargo.toml configuration](#cargotoml-configuration)
-   - [A simple example](#a-simple-example)
- **[Resources](#resources)**
-   - [TFHE deep dive](#tfhe-deep-dive)
-   - [Tutorials](#tutorials)
-   - [Documentation](#documentation)
- **[Working with TFHE-rs](#working-with-tfhe-rs)**
-   - [Disclaimers](#disclaimers)
-   - [Citations](#citations)
-   - [Contributing](#contributing)
-   - [License](#license)
- **[Support](#support)**
-<br></br>

 ## Getting Started
+The steps to run a first example are described below. 

 ### Cargo.toml configuration
 To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:
@@ -77,24 +47,20 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64-un
 ```toml
 tfhe = { version = "*", features = ["boolean", "shortint", "integer", "aarch64-unix"] }
 ```
+Note: users with ARM devices must compile `TFHE-rs` using a stable toolchain with version >= 1.72.

-+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:
+
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) 
+running Windows:

 ```toml
 tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"] }
 ```

-> [!Note]
-> Note: You need to use a Rust version >= 1.72 to compile TFHE-rs.
+Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.

-> [!Note]
-> Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.

-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
-
-### A simple example
+## A simple example

 Here is a full example:

@@ -104,7 +70,9 @@ use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint32, FheUint8};

 fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Basic configuration to use homomorphic integers
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();

    // Key generation
    let (client_key, server_keys) = generate_keys(config);
@@ -151,64 +119,32 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
 To run this code, use the following command: 
 <p align="center"> <code> cargo run --release </code> </p>

-> [!Note]
-> Note that when running code that uses `TFHE-rs`, it is highly recommended
-to run in release mode with cargo's `--release` flag to have the best performances possible.
-
-*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
-
-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
+Note that when running code that uses `tfhe-rs`, it is highly recommended
+to run in release mode with cargo's `--release` flag to have the best possible performance.


+## Contributing

-## Resources 
+There are two ways to contribute to TFHE-rs:

-### TFHE deep dive
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
-<br></br>
+- you can open issues to report bugs or typos, or to suggest new ideas
+- you can ask to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
+(becoming an approved contributor involves signing our Contributor License Agreement (CLA))

-### Tutorials
- [Homomorphic Parity Bit](https://docs.zama.ai/tfhe-rs/tutorials/parity_bit)
- [Homomorphic Case Changing on Ascii String](https://docs.zama.ai/tfhe-rs/tutorials/ascii_fhe_string)
- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
- [Dark Market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
- [Regular Expression Engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)
+Only approved contributors can send pull requests, so please make sure to get in touch before you do!

+## Credits

-*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
-<br></br>
-### Documentation
+This library uses several dependencies and we would like to thank the contributors of those
+libraries.

-Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
+## Need support?
+<a target="_blank" href="https://community.zama.ai">
+  <img src="https://user-images.githubusercontent.com/5758427/231115030-21195b55-2629-4c01-9809-be5059243999.png">
+</a>

+## Citing TFHE-rs

-## Working with TFHE-rs
-
-### Disclaimers
-
-#### Security Estimation
-
-Security estimations are done using the
-[Lattice Estimator](https://github.com/malb/lattice-estimator)
-with `red_cost_model = reduction.RC.BDGL16`.
-
-When a new update is published in the Lattice Estimator, we update parameters accordingly.
-
-#### Side-Channel Attacks
-
-Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
-and will be released in upcoming versions.
-<br></br>
-
-### Citations
 To cite TFHE-rs in academic papers, please use the following entry:

 ```text
@@ -220,31 +156,22 @@ To cite TFHE-rs in academic papers, please use the following entry:
 }
 ```

-### Contributing
+## License

-There are two ways to contribute to TFHE-rs:
+This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
+please contact us at `hello@zama.ai`.

- [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
+## Disclaimers

-Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
-<br></br>
+### Security Estimation

-### License
-This software is distributed under the **BSD-3-Clause-Clear** license. If you have any questions, please contact us at hello@zama.ai.
-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
+Security estimations are done using the
+[Lattice Estimator](https://github.com/malb/lattice-estimator)
+with `red_cost_model = reduction.RC.BDGL16`.

+When a new update is published in the Lattice Estimator, we update parameters accordingly.

-## Support
+### Side-Channel Attacks

-<a target="_blank" href="https://community.zama.ai">
-  <img src="https://github.com/zama-ai/tfhe-rs/assets/157474013/8da6cf5b-51a0-4c86-9e75-fd0e4a4c64a4">
-</a>
-
-🌟 If you find this project helpful or interesting, please consider giving it a star on GitHub! Your support helps to grow the community and motivates further development. 
-
-<p align="right">
-  <a href="#about" > ↑ Back to top </a> 
-</p>
+Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
+and will be released in upcoming versions.
--- a/apps/trivium/benches/kreyvium_bool.rs
+++ b/apps/trivium/benches/kreyvium_bool.rs
@@ -6,7 +6,7 @@ use tfhe_trivium::KreyviumStream;
 use criterion::Criterion;

 pub fn kreyvium_bool_gen(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled().enable_default_bool().build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -41,7 +41,7 @@ pub fn kreyvium_bool_gen(c: &mut Criterion) {
 }

 pub fn kreyvium_bool_warmup(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled().enable_default_bool().build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/benches/kreyvium_byte.rs
+++ b/apps/trivium/benches/kreyvium_byte.rs
@@ -6,8 +6,9 @@ use tfhe_trivium::{KreyviumStreamByte, TransCiphering};
 use criterion::Criterion;

 pub fn kreyvium_byte_gen(c: &mut Criterion) {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .enable_function_evaluation_integers()
        .build();
    let (client_key, server_key) = generate_keys(config);

@@ -35,8 +36,9 @@ pub fn kreyvium_byte_gen(c: &mut Criterion) {
 }

 pub fn kreyvium_byte_trans(c: &mut Criterion) {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .enable_function_evaluation_integers()
        .build();
    let (client_key, server_key) = generate_keys(config);

@@ -65,8 +67,9 @@ pub fn kreyvium_byte_trans(c: &mut Criterion) {
 }

 pub fn kreyvium_byte_warmup(c: &mut Criterion) {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .enable_function_evaluation_integers()
        .build();
    let (client_key, server_key) = generate_keys(config);

--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -8,7 +8,9 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};
 use criterion::Criterion;

 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
@@ -58,7 +60,9 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
 }

 pub fn kreyvium_shortint_gen(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
@@ -103,7 +107,9 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
 }

 pub fn kreyvium_shortint_trans(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
--- a/apps/trivium/benches/trivium_bool.rs
+++ b/apps/trivium/benches/trivium_bool.rs
@@ -6,7 +6,7 @@ use tfhe_trivium::TriviumStream;
 use criterion::Criterion;

 pub fn trivium_bool_gen(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled().enable_default_bool().build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -41,7 +41,7 @@ pub fn trivium_bool_gen(c: &mut Criterion) {
 }

 pub fn trivium_bool_warmup(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled().enable_default_bool().build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/benches/trivium_byte.rs
+++ b/apps/trivium/benches/trivium_byte.rs
@@ -6,7 +6,9 @@ use tfhe_trivium::{TransCiphering, TriviumStreamByte};
 use criterion::Criterion;

 pub fn trivium_byte_gen(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -33,7 +35,9 @@ pub fn trivium_byte_gen(c: &mut Criterion) {
 }

 pub fn trivium_byte_trans(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -61,7 +65,9 @@ pub fn trivium_byte_trans(c: &mut Criterion) {
 }

 pub fn trivium_byte_warmup(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -8,7 +8,9 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};
 use criterion::Criterion;

 pub fn trivium_shortint_warmup(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
@@ -58,7 +60,9 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
 }

 pub fn trivium_shortint_gen(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
@@ -103,7 +107,9 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
 }

 pub fn trivium_shortint_trans(c: &mut Criterion) {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -170,7 +170,7 @@ fn kreyvium_test_4() {

 #[test]
 fn kreyvium_test_fhe_long() {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled().enable_default_bool().build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -217,7 +217,9 @@ use tfhe::shortint::prelude::*;

 #[test]
 fn kreyvium_test_shortint_long() {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
@@ -300,8 +302,9 @@ fn kreyvium_test_clear_byte() {

 #[test]
 fn kreyvium_test_byte_long() {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .enable_function_evaluation_integers()
        .build();
    let (client_key, server_key) = generate_keys(config);

@@ -339,8 +342,9 @@ fn kreyvium_test_byte_long() {

 #[test]
 fn kreyvium_test_fhe_byte_transciphering_long() {
-    let config = ConfigBuilder::default()
-        .enable_function_evaluation()
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .enable_function_evaluation_integers()
        .build();
    let (client_key, server_key) = generate_keys(config);

--- a/apps/trivium/src/trans_ciphering/mod.rs
+++ b/apps/trivium/src/trans_ciphering/mod.rs
@@ -4,7 +4,6 @@
 use crate::{KreyviumStreamByte, KreyviumStreamShortint, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::shortint::Ciphertext;

-use tfhe::prelude::*;
 use tfhe::{set_server_key, unset_server_key, FheUint64, FheUint8, ServerKey};

 use rayon::prelude::*;
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -232,7 +232,7 @@ fn trivium_test_clear_byte() {

 #[test]
 fn trivium_test_fhe_long() {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled().enable_default_bool().build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -277,7 +277,9 @@ fn trivium_test_fhe_long() {

 #[test]
 fn trivium_test_fhe_byte_long() {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -314,7 +316,9 @@ fn trivium_test_fhe_byte_long() {

 #[test]
 fn trivium_test_fhe_byte_transciphering_long() {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (client_key, server_key) = generate_keys(config);

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -353,7 +357,9 @@ use tfhe::shortint::prelude::*;

 #[test]
 fn trivium_test_shortint_long() {
-    let config = ConfigBuilder::default().build();
+    let config = ConfigBuilder::all_disabled()
+        .enable_default_integers()
+        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,18 +0,0 @@
-[package]
-name = "tfhe-cuda-backend"
-version = "0.1.2"
-edition = "2021"
-authors = ["Zama team"]
-license = "BSD-3-Clause-Clear"
-description = "Cuda implementation of TFHE-rs primitives."
-homepage = "https://www.zama.ai/"
-documentation = "https://docs.zama.ai/tfhe-rs"
-repository = "https://github.com/zama-ai/tfhe-rs"
-readme = "README.md"
-keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
-
-[build-dependencies]
-cmake = { version = "0.1" }
-
-[dependencies]
-thiserror = "1.0"
--- a/backends/tfhe-cuda-backend/LICENSE
+++ b/backends/tfhe-cuda-backend/LICENSE
@@ -1,28 +0,0 @@
-BSD 3-Clause Clear License
-
-Copyright © 2024 ZAMA.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice, this
-list of conditions and the following disclaimer in the documentation and/or other
-materials provided with the distribution.
-
-3. Neither the name of ZAMA nor the names of its contributors may be used to endorse
-or promote products derived from this software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
-THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
-ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/backends/tfhe-cuda-backend/README.md
+++ b/backends/tfhe-cuda-backend/README.md
@@ -1,52 +0,0 @@
-# TFHE Cuda backend
-
-## Introduction
-
-The `tfhe-cuda-backend` holds the code for GPU acceleration of Zama's variant of TFHE.
-It implements CUDA/C++ functions to perform homomorphic operations on LWE ciphertexts.
-
-It provides functions to allocate memory on the GPU, to copy data back 
-and forth between the CPU and the GPU, to create and destroy Cuda streams, etc.:
- `cuda_create_stream`, `cuda_destroy_stream`
- `cuda_malloc`, `cuda_check_valid_malloc`
- `cuda_memcpy_async_to_cpu`, `cuda_memcpy_async_to_gpu`
- `cuda_get_number_of_gpus`
- `cuda_synchronize_device`
-The cryptographic operations it provides are:
- an amortized implementation of the TFHE programmable bootstrap: `cuda_bootstrap_amortized_lwe_ciphertext_vector_32` and `cuda_bootstrap_amortized_lwe_ciphertext_vector_64`
- a low latency implementation of the TFHE programmable bootstrap: `cuda_bootstrap_low latency_lwe_ciphertext_vector_32` and `cuda_bootstrap_low_latency_lwe_ciphertext_vector_64`
- the keyswitch: `cuda_keyswitch_lwe_ciphertext_vector_32` and `cuda_keyswitch_lwe_ciphertext_vector_64`
- the larger precision programmable bootstrap (wop PBS, which supports up to 16 bits of message while the classical PBS only supports up to 8 bits of message) and its sub-components: `cuda_wop_pbs_64`, `cuda_extract_bits_64`, `cuda_circuit_bootstrap_64`, `cuda_cmux_tree_64`, `cuda_blind_rotation_sample_extraction_64`
- acceleration for leveled operations: `cuda_negate_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_plaintext_vector_64`, `cuda_mult_lwe_ciphertext_vector_cleartext_vector`.
-
-## Dependencies
-
-**Disclaimer**: Compilation on Windows/Mac is not supported yet. Only Nvidia GPUs are supported. 
-
- nvidia driver - for example, if you're running Ubuntu 20.04 check this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux) for installation
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) >= 10.0
- [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
- [cmake](https://cmake.org/) >= 3.24
-
-## Build
-
-The Cuda project held in `tfhe-cuda-backend` can be compiled independently from TFHE-rs in the following way:
-```
-git clone git@github.com:zama-ai/tfhe-rs
-cd backends/tfhe-cuda-backend/cuda
-mkdir build
-cd build
-cmake ..
-make
-```
-The compute capability is detected automatically (with the first GPU information) and set accordingly.
-If your machine does not have an available Nvidia GPU, the compilation will work if you have the nvcc compiler installed. The generated executable will target a 7.0 compute capability (sm_70).
-
-## Links
-
- [TFHE](https://eprint.iacr.org/2018/421.pdf)
-
-## License
-
-This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
-please contact us at `hello@zama.ai`.
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -1,28 +0,0 @@
-use std::env;
-use std::process::Command;
-
-fn main() {
-    println!("Build tfhe-cuda-backend");
-    if env::consts::OS == "linux" {
-        let output = Command::new("./get_os_name.sh").output().unwrap();
-        let distribution = String::from_utf8(output.stdout).unwrap();
-        if distribution != "Ubuntu\n" {
-            println!(
-                "cargo:warning=This Linux distribution is not officially supported. \
-                Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
-            );
-        }
-        let dest = cmake::build("cuda");
-        println!("cargo:rustc-link-search=native={}", dest.display());
-        println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
-        println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
-        println!("cargo:rustc-link-lib=gomp");
-        println!("cargo:rustc-link-lib=cudart");
-        println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
-        println!("cargo:rustc-link-lib=stdc++");
-    } else {
-        panic!(
-            "Error: platform not supported, tfhe-cuda-backend not built (only Linux is supported)"
-        );
-    }
-}
--- a/backends/tfhe-cuda-backend/cuda/.cmake-format-config.py
+++ b/backends/tfhe-cuda-backend/cuda/.cmake-format-config.py
@@ -1,10 +0,0 @@
-# -----------------------------
-# Options effecting formatting.
-# -----------------------------
-with section("format"):
-
-  # How wide to allow formatted cmake files
-  line_width = 120
-  
-  # How many spaces to tab for indent
-  tab_size = 2
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -1,90 +0,0 @@
-cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
-project(tfhe_cuda_backend LANGUAGES CXX)
-
-# See if the minimum CUDA version is available. If not, only enable documentation building.
-set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
-include(CheckLanguage)
-# See if CUDA is available
-check_language(CUDA)
-# If so, enable CUDA to check the version.
-if(CMAKE_CUDA_COMPILER)
-  enable_language(CUDA)
-endif()
-# If CUDA is not available, or the minimum version is too low do not build
-if(NOT CMAKE_CUDA_COMPILER)
-  message(FATAL_ERROR "Cuda compiler not found.")
-endif()
-
-if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
-  message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
-endif()
-# Get CUDA compute capability
-set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
-set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
-execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
-execute_process(
-  COMMAND ${OUTPUTFILE}
-  RESULT_VARIABLE CUDA_RETURN_CODE
-  OUTPUT_VARIABLE ARCH)
-file(REMOVE ${OUTPUTFILE})
-
-if(${CUDA_RETURN_CODE} EQUAL 0)
-  set(CUDA_SUCCESS "TRUE")
-else()
-  set(CUDA_SUCCESS "FALSE")
-endif()
-
-if(${CUDA_SUCCESS})
-  message(STATUS "CUDA Architecture: ${ARCH}")
-  message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
-  message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
-  message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
-  message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")
-else()
-  message(WARNING ${ARCH})
-endif()
-
-if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE Release)
-endif()
-
-# Add OpenMP support
-find_package(OpenMP REQUIRED)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -g")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
-if(${CUDA_SUCCESS})
-  set(CMAKE_CUDA_ARCHITECTURES native)
-else()
-  set(CMAKE_CUDA_ARCHITECTURES 70)
-endif()
-
-# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
-set(CMAKE_CUDA_FLAGS
-    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
-  -std=c++17 --no-exceptions  --expt-relaxed-constexpr -rdc=true \
-  --use_fast_math -Xcompiler -fPIC")
-
-set(INCLUDE_DIR include)
-
-add_subdirectory(src)
-target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
-
-# This is required for rust cargo build
-install(TARGETS tfhe_cuda_backend DESTINATION .)
-install(TARGETS tfhe_cuda_backend DESTINATION lib)
-
-# Define a function to add a lint target.
-find_file(CPPLINT NAMES cpplint cpplint.exe)
-if(CPPLINT)
-  # Add a custom target to lint all child projects. Dependencies are specified in child projects.
-  add_custom_target(all_lint)
-  # Don't trigger this target on ALL_BUILD or Visual Studio 'Rebuild Solution'
-  set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
-  # set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
-endif()
-
-enable_testing()
--- a/backends/tfhe-cuda-backend/cuda/CPPLINT.cfg
+++ b/backends/tfhe-cuda-backend/cuda/CPPLINT.cfg
@@ -1,3 +0,0 @@
-set noparent 
-linelength=240
-filter=-legal/copyright,-readability/todo,-runtime/references,-build/c++17
--- a/backends/tfhe-cuda-backend/cuda/check_cuda.cu
+++ b/backends/tfhe-cuda-backend/cuda/check_cuda.cu
@@ -1,22 +0,0 @@
-#include <stdio.h>
-
-int main(int argc, char **argv) {
-  cudaDeviceProp dP;
-  float min_cc = 3.0;
-
-  int rc = cudaGetDeviceProperties(&dP, 0);
-  if (rc != cudaSuccess) {
-    cudaError_t error = cudaGetLastError();
-    printf("CUDA error: %s", cudaGetErrorString(error));
-    return rc; /* Failure */
-  }
-  if ((dP.major + (dP.minor / 10)) < min_cc) {
-    printf("Min Compute Capability of %2.1f required:  %d.%d found\n Not "
-           "Building CUDA Code",
-           min_cc, dP.major, dP.minor);
-    return 1; /* Failure */
-  } else {
-    printf("-arch=sm_%d%d", dP.major, dP.minor);
-    return 0; /* Success */
-  }
-}
--- a/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
+++ b/backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-set -e
-
-while getopts ":c" option; do
-  case $option in
-    c)
-      # code to execute when flag1 is provided
-      find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
-      cmake-format -i CMakeLists.txt -c .cmake-format-config.py
-      find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
-      git diff --exit-code
-      exit
-      ;;
-  esac
-done
-find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
-cmake-format -i CMakeLists.txt -c .cmake-format-config.py
-find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
--- a/backends/tfhe-cuda-backend/cuda/include/bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/bootstrap.h
@@ -1,118 +0,0 @@
-#ifndef CUDA_BOOTSTRAP_H
-#define CUDA_BOOTSTRAP_H
-
-#include "device.h"
-#include <cstdint>
-
-enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
-
-extern "C" {
-void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
-                                 cuda_stream_t *stream,
-                                 uint32_t polynomial_size,
-                                 uint32_t total_polynomials);
-
-void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
-                                       cuda_stream_t *stream,
-                                       uint32_t input_lwe_dim,
-                                       uint32_t glwe_dim, uint32_t level_count,
-                                       uint32_t polynomial_size);
-
-void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
-                                       cuda_stream_t *stream,
-                                       uint32_t input_lwe_dim,
-                                       uint32_t glwe_dim, uint32_t level_count,
-                                       uint32_t polynomial_size);
-
-void scratch_cuda_bootstrap_amortized_32(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory);
-
-void scratch_cuda_bootstrap_amortized_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory);
-
-void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
-                                      int8_t **pbs_buffer);
-
-void scratch_cuda_bootstrap_low_latency_32(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
-
-void scratch_cuda_bootstrap_low_latency_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
-
-void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
-
-void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
-                                        int8_t **pbs_buffer);
-
-uint64_t get_buffer_size_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
-
-uint64_t get_buffer_size_bootstrap_low_latency_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
-}
-
-#ifdef __CUDACC__
-__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
-                                         int glwe_dimension,
-                                         uint32_t level_count);
-
-template <typename T>
-__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
-                                     uint32_t polynomial_size,
-                                     int glwe_dimension, uint32_t level_count);
-
-template <typename T>
-__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
-                                     uint32_t polynomial_size,
-                                     int glwe_dimension, uint32_t level_count);
-
-template <typename T>
-__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
-    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
-    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
-
-#endif
-
-#endif // CUDA_BOOTSTRAP_H
--- a/backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h
@@ -1,46 +0,0 @@
-#ifndef CUDA_MULTI_BIT_H
-#define CUDA_MULTI_BIT_H
-
-#include <cstdint>
-
-extern "C" {
-void cuda_convert_lwe_multi_bit_bootstrap_key_64(
-    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
-    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
-    uint32_t grouping_factor);
-
-void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
-    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, uint32_t chunk_size = 0);
-
-void scratch_cuda_multi_bit_pbs_64(
-    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t chunk_size = 0);
-
-void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
-}
-#ifdef __CUDACC__
-__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
-                                     uint32_t level_count,
-                                     uint32_t glwe_dimension,
-                                     uint32_t num_samples);
-
-__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
-                                             uint32_t level_count,
-                                             uint32_t glwe_dimension,
-                                             uint32_t ct_count);
-
-__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
-#endif
-
-#endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -1,18 +0,0 @@
-#ifndef CUDA_CIPHERTEXT_H
-#define CUDA_CIPHERTEXT_H
-
-#include <cstdint>
-
-extern "C" {
-void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
-                                                  void *v_stream,
-                                                  uint32_t gpu_index,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension);
-void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
-                                                  void *v_stream,
-                                                  uint32_t gpu_index,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension);
-};
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -1,92 +0,0 @@
-#ifndef DEVICE_H
-#define DEVICE_H
-
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <cuda_runtime.h>
-
-#define synchronize_threads_in_block() __syncthreads()
-
-extern "C" {
-
-#define check_cuda_error(ans)                                                  \
-  { cuda_error((ans), __FILE__, __LINE__); }
-inline void cuda_error(cudaError_t code, const char *file, int line) {
-  if (code != cudaSuccess) {
-    std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code),
-                 file, line);
-    std::abort();
-  }
-}
-#define PANIC(format, ...)                                                     \
-  {                                                                            \
-    std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__,         \
-                 __LINE__, __func__, ##__VA_ARGS__);                           \
-    std::abort();                                                              \
-  }
-
-struct cuda_stream_t {
-  cudaStream_t stream;
-  uint32_t gpu_index;
-
-  cuda_stream_t(uint32_t gpu_index) {
-    this->gpu_index = gpu_index;
-
-    check_cuda_error(cudaStreamCreate(&stream));
-  }
-
-  void release() {
-    check_cuda_error(cudaSetDevice(gpu_index));
-    check_cuda_error(cudaStreamDestroy(stream));
-  }
-
-  void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
-};
-
-cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
-
-void cuda_destroy_stream(cuda_stream_t *stream);
-
-void *cuda_malloc(uint64_t size, uint32_t gpu_index);
-
-void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
-
-void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
-
-bool cuda_check_support_cooperative_groups();
-
-void cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);
-
-void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                              cuda_stream_t *stream);
-
-void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                                  cuda_stream_t *stream);
-
-void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);
-
-void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                              cuda_stream_t *stream);
-
-void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
-                       cuda_stream_t *stream);
-
-int cuda_get_number_of_gpus();
-
-void cuda_synchronize_device(uint32_t gpu_index);
-
-void cuda_drop(void *ptr, uint32_t gpu_index);
-
-void cuda_drop_async(void *ptr, cuda_stream_t *stream);
-
-int cuda_get_max_shared_memory(uint32_t gpu_index);
-
-void cuda_synchronize_stream(cuda_stream_t *stream);
-}
-
-template <typename Torus>
-void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
-                          Torus n);
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/helper_debug.cuh
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_debug.cuh
@@ -1,100 +0,0 @@
-#include "cuComplex.h"
-#include "thrust/complex.h"
-#include <iostream>
-#include <string>
-#include <type_traits>
-
-#define PRINT_VARS
-#ifdef PRINT_VARS
-#define PRINT_DEBUG_5(var, begin, end, step, cond)                             \
-  _print_debug(var, #var, begin, end, step, cond, "", false)
-#define PRINT_DEBUG_6(var, begin, end, step, cond, text)                       \
-  _print_debug(var, #var, begin, end, step, cond, text, true)
-#define CAT(A, B) A##B
-#define PRINT_SELECT(NAME, NUM) CAT(NAME##_, NUM)
-#define GET_COUNT(_1, _2, _3, _4, _5, _6, COUNT, ...) COUNT
-#define VA_SIZE(...) GET_COUNT(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
-#define PRINT_DEBUG(...)                                                       \
-  PRINT_SELECT(PRINT_DEBUG, VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
-#else
-#define PRINT_DEBUG(...)
-#endif
-
-template <typename T>
-__device__ typename std::enable_if<std::is_unsigned<T>::value, void>::type
-_print_debug(T *var, const char *var_name, int start, int end, int step,
-             bool cond, const char *text, bool has_text) {
-  __syncthreads();
-  if (cond) {
-    if (has_text)
-      printf("%s\n", text);
-    for (int i = start; i < end; i += step) {
-      printf("%s[%u]: %u\n", var_name, i, var[i]);
-    }
-  }
-  __syncthreads();
-}
-
-template <typename T>
-__device__ typename std::enable_if<std::is_signed<T>::value, void>::type
-_print_debug(T *var, const char *var_name, int start, int end, int step,
-             bool cond, const char *text, bool has_text) {
-  __syncthreads();
-  if (cond) {
-    if (has_text)
-      printf("%s\n", text);
-    for (int i = start; i < end; i += step) {
-      printf("%s[%u]: %d\n", var_name, i, var[i]);
-    }
-  }
-  __syncthreads();
-}
-
-template <typename T>
-__device__ typename std::enable_if<std::is_floating_point<T>::value, void>::type
-_print_debug(T *var, const char *var_name, int start, int end, int step,
-             bool cond, const char *text, bool has_text) {
-  __syncthreads();
-  if (cond) {
-    if (has_text)
-      printf("%s\n", text);
-    for (int i = start; i < end; i += step) {
-      printf("%s[%u]: %.15f\n", var_name, i, var[i]);
-    }
-  }
-  __syncthreads();
-}
-
-template <typename T>
-__device__
-    typename std::enable_if<std::is_same<T, thrust::complex<double>>::value,
-                            void>::type
-    _print_debug(T *var, const char *var_name, int start, int end, int step,
-                 bool cond, const char *text, bool has_text) {
-  __syncthreads();
-  if (cond) {
-    if (has_text)
-      printf("%s\n", text);
-    for (int i = start; i < end; i += step) {
-      printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].real(),
-             var[i].imag());
-    }
-  }
-  __syncthreads();
-}
-
-template <typename T>
-__device__
-    typename std::enable_if<std::is_same<T, cuDoubleComplex>::value, void>::type
-    _print_debug(T *var, const char *var_name, int start, int end, int step,
-                 bool cond, const char *text, bool has_text) {
-  __syncthreads();
-  if (cond) {
-    if (has_text)
-      printf("%s\n", text);
-    for (int i = start; i < end; i += step) {
-      printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].x, var[i].y);
-    }
-  }
-  __syncthreads();
-}
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -1,21 +0,0 @@
-#ifndef CNCRT_KS_H_
-#define CNCRT_KS_H_
-
-#include <cstdint>
-
-extern "C" {
-
-void cuda_keyswitch_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
-
-void cuda_keyswitch_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples);
-}
-
-#endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -1,50 +0,0 @@
-#ifndef CUDA_LINALG_H_
-#define CUDA_LINALG_H_
-
-#include "bootstrap.h"
-#include <cstdint>
-#include <device.h>
-
-extern "C" {
-
-void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
-                                          void *lwe_array_out,
-                                          void *lwe_array_in,
-                                          uint32_t input_lwe_dimension,
-                                          uint32_t input_lwe_ciphertext_count);
-void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
-                                          void *lwe_array_out,
-                                          void *lwe_array_in,
-                                          uint32_t input_lwe_dimension,
-                                          uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
-                                       void *lwe_array_out,
-                                       void *lwe_array_in_1,
-                                       void *lwe_array_in_2,
-                                       uint32_t input_lwe_dimension,
-                                       uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
-                                       void *lwe_array_out,
-                                       void *lwe_array_in_1,
-                                       void *lwe_array_in_2,
-                                       uint32_t input_lwe_dimension,
-                                       uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
-    void *plaintext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
-    void *plaintext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count);
-void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
-    void *cleartext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count);
-void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
-    void *cleartext_array_in, uint32_t input_lwe_dimension,
-    uint32_t input_lwe_ciphertext_count);
-}
-
-#endif // CUDA_LINALG_H_
--- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,18 +0,0 @@
-set(SOURCES
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
-    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
-file(GLOB_RECURSE SOURCES "*.cu")
-add_library(tfhe_cuda_backend STATIC ${SOURCES})
-set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
-target_include_directories(tfhe_cuda_backend PRIVATE .)
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1 +0,0 @@
-#include "ciphertext.cuh"
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -1,44 +0,0 @@
-#ifndef CUDA_CIPHERTEXT_CUH
-#define CUDA_CIPHERTEXT_CUH
-
-#include "ciphertext.h"
-#include "device.h"
-#include <cstdint>
-
-template <typename T>
-void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
-                                               cuda_stream_t *stream,
-                                               uint32_t number_of_cts,
-                                               uint32_t lwe_dimension) {
-  cudaSetDevice(stream->gpu_index);
-  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
-  cuda_memcpy_async_to_gpu(dest, src, size, stream);
-}
-
-void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
-                                                  cuda_stream_t *stream,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension) {
-  cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
-      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
-}
-
-template <typename T>
-void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
-                                               cuda_stream_t *stream,
-                                               uint32_t number_of_cts,
-                                               uint32_t lwe_dimension) {
-  cudaSetDevice(stream->gpu_index);
-  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
-  cuda_memcpy_async_to_cpu(dest, src, size, stream);
-}
-
-void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
-                                                  cuda_stream_t *stream,
-                                                  uint32_t number_of_cts,
-                                                  uint32_t lwe_dimension) {
-  cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
-      (uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
@@ -1,162 +0,0 @@
-#ifndef CNCRT_CRYPTO_CUH
-#define CNCRT_CRPYTO_CUH
-
-#include "device.h"
-#include <cstdint>
-
-/**
- * GadgetMatrix implements the iterator design pattern to decompose a set of
- * num_poly consecutive polynomials with degree params::degree. A total of
- * level_count levels is expected and each call to decompose_and_compress_next()
- * writes to the result the next level. It is also possible to advance an
- * arbitrary amount of levels by using decompose_and_compress_level().
- *
- * This class always decomposes the entire set of num_poly polynomials.
- * By default, it works on a single polynomial.
- */
-#pragma once
-template <typename T, class params> class GadgetMatrix {
-private:
-  uint32_t level_count;
-  uint32_t base_log;
-  uint32_t mask;
-  uint32_t halfbg;
-  uint32_t num_poly;
-  T offset;
-  int current_level;
-  T mask_mod_b;
-  T *state;
-
-public:
-  __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
-                          uint32_t num_poly = 1)
-      : base_log(base_log), level_count(level_count), num_poly(num_poly),
-        state(state) {
-
-    mask_mod_b = (1ll << base_log) - 1ll;
-    current_level = level_count;
-    int tid = threadIdx.x;
-    for (int i = 0; i < num_poly * params::opt; i++) {
-      state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-  }
-
-  // Decomposes all polynomials at once
-  __device__ void decompose_and_compress_next(double2 *result) {
-    for (int j = 0; j < num_poly; j++) {
-      auto result_slice = result + j * params::degree / 2;
-      decompose_and_compress_next_polynomial(result_slice, j);
-    }
-  }
-
-  // Decomposes a single polynomial
-  __device__ void decompose_and_compress_next_polynomial(double2 *result,
-                                                         int j) {
-    if (j == 0)
-      current_level -= 1;
-
-    int tid = threadIdx.x;
-    auto state_slice = state + j * params::degree;
-    for (int i = 0; i < params::opt / 2; i++) {
-      T res_re = state_slice[tid] & mask_mod_b;
-      T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
-      state_slice[tid] >>= base_log;
-      state_slice[tid + params::degree / 2] >>= base_log;
-      T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
-      T carry_im =
-          ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
-      carry_re >>= (base_log - 1);
-      carry_im >>= (base_log - 1);
-      state_slice[tid] += carry_re;
-      state_slice[tid + params::degree / 2] += carry_im;
-      res_re -= carry_re << base_log;
-      res_im -= carry_im << base_log;
-
-      result[tid].x = (int32_t)res_re;
-      result[tid].y = (int32_t)res_im;
-
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-  }
-
-  // Decomposes a single polynomial
-  __device__ void
-  decompose_and_compress_next_polynomial_elements(double2 *result, int j) {
-    if (j == 0)
-      current_level -= 1;
-
-    int tid = threadIdx.x;
-    auto state_slice = state + j * params::degree;
-    for (int i = 0; i < params::opt / 2; i++) {
-      T res_re = state_slice[tid] & mask_mod_b;
-      T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
-      state_slice[tid] >>= base_log;
-      state_slice[tid + params::degree / 2] >>= base_log;
-      T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
-      T carry_im =
-          ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
-      carry_re >>= (base_log - 1);
-      carry_im >>= (base_log - 1);
-      state_slice[tid] += carry_re;
-      state_slice[tid + params::degree / 2] += carry_im;
-      res_re -= carry_re << base_log;
-      res_im -= carry_im << base_log;
-
-      result[i].x = (int32_t)res_re;
-      result[i].y = (int32_t)res_im;
-
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-  }
-
-  __device__ void decompose_and_compress_level(double2 *result, int level) {
-    for (int i = 0; i < level_count - level; i++)
-      decompose_and_compress_next(result);
-  }
-};
-
-template <typename T> class GadgetMatrixSingle {
-private:
-  uint32_t level_count;
-  uint32_t base_log;
-  uint32_t mask;
-  uint32_t halfbg;
-  T offset;
-
-public:
-  __device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
-      : base_log(base_log), level_count(level_count) {
-    uint32_t bg = 1 << base_log;
-    this->halfbg = bg / 2;
-    this->mask = bg - 1;
-    T temp = 0;
-    for (int i = 0; i < this->level_count; i++) {
-      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
-    }
-    this->offset = temp * this->halfbg;
-  }
-
-  __device__ T decompose_one_level_single(T element, uint32_t level) {
-    T s = element + this->offset;
-    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
-    T temp1 = (s >> decal) & this->mask;
-    return (T)(temp1 - this->halfbg);
-  }
-};
-
-template <typename Torus>
-__device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
-  Torus res = state & mask_mod_b;
-  state >>= base_log;
-  Torus carry = ((res - 1ll) | state) & res;
-  carry >>= base_log - 1;
-  state += carry;
-  res -= carry << base_log;
-  return res;
-}
-
-#endif // CNCRT_CRPYTO_H
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
@@ -1,74 +0,0 @@
-#ifndef CNCRT_GGSW_CUH
-#define CNCRT_GGSW_CUH
-
-#include "device.h"
-#include "fft/bnsmfft.cuh"
-#include "polynomial/parameters.cuh"
-
-template <typename T, typename ST, class params, sharedMemDegree SMD>
-__global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
-                                             int8_t *device_mem) {
-
-  extern __shared__ int8_t sharedmem[];
-  double2 *selected_memory;
-
-  if constexpr (SMD == FULLSM)
-    selected_memory = (double2 *)sharedmem;
-  else
-    selected_memory = (double2 *)device_mem[blockIdx.x * params::degree];
-
-  // Compression
-  int offset = blockIdx.x * blockDim.x;
-
-  int tid = threadIdx.x;
-#pragma unroll
-  for (int i = 0; i < params::opt / 2; i++) {
-    ST x = src[(tid) + params::opt * offset];
-    ST y = src[(tid + params::degree / 2) + params::opt * offset];
-    selected_memory[tid].x = x / (double)std::numeric_limits<T>::max();
-    selected_memory[tid].y = y / (double)std::numeric_limits<T>::max();
-    tid += params::degree / params::opt;
-  }
-  synchronize_threads_in_block();
-
-  // Switch to the FFT space
-  NSMFFT_direct<HalfDegree<params>>(selected_memory);
-  synchronize_threads_in_block();
-
-  // Write the output to global memory
-  tid = threadIdx.x;
-#pragma unroll
-  for (int j = 0; j < params::opt / 2; j++) {
-    dest[tid + (params::opt >> 1) * offset] = selected_memory[tid];
-    tid += params::degree / params::opt;
-  }
-}
-
-/**
- * Applies the FFT transform on sequence of GGSW ciphertexts already in the
- * global memory
- */
-template <typename T, typename ST, class params>
-void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
-                           int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
-                           uint32_t polynomial_size, uint32_t level_count,
-                           uint32_t gpu_index, uint32_t max_shared_memory) {
-  cudaSetDevice(stream->gpu_index);
-
-  int shared_memory_size = sizeof(double) * polynomial_size;
-
-  int gridSize = r * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
-  int blockSize = polynomial_size / params::opt;
-
-  if (max_shared_memory < shared_memory_size) {
-    device_batch_fft_ggsw_vector<T, ST, params, NOSM>
-        <<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
-  } else {
-    device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
-        <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
-                                                                      d_mem);
-  }
-  check_cuda_error(cudaGetLastError());
-}
-
-#endif // CNCRT_GGSW_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -1,48 +0,0 @@
-#include "keyswitch.cuh"
-#include "keyswitch.h"
-#include <cstdint>
-
-/* Perform keyswitch on a batch of 32 bits input LWE ciphertexts.
- * Head out to the equivalent operation on 64 bits for more details.
- */
-void cuda_keyswitch_lwe_ciphertext_vector_32(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, static_cast<uint32_t *>(lwe_array_out),
-      static_cast<uint32_t *>(lwe_output_indexes),
-      static_cast<uint32_t *>(lwe_array_in),
-      static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
-}
-
-/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
- *
- * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
- * launch
- * - `gpu_index` is the index of the GPU to be used in the kernel launch
- *  - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
- * (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
- *  - lwe_array_in: input batch of num_samples LWE ciphertexts, containing
- * lwe_dimension_in mask values + 1 body value
- *  - ksk: the keyswitch key to be used in the operation
- *  - base log: the log of the base used in the decomposition (should be the one
- * used to create the ksk)
- *
- * This function calls a wrapper to a device kernel that performs the keyswitch
- * 	- num_samples blocks of threads are launched
- */
-void cuda_keyswitch_lwe_ciphertext_vector_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
-    void *lwe_array_in, void *lwe_input_indexes, void *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
-  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, static_cast<uint64_t *>(lwe_array_out),
-      static_cast<uint64_t *>(lwe_output_indexes),
-      static_cast<uint64_t *>(lwe_array_in),
-      static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -1,144 +0,0 @@
-#ifndef CNCRT_KS_CUH
-#define CNCRT_KS_CUH
-
-#include "device.h"
-#include "gadget.cuh"
-#include "polynomial/polynomial_math.cuh"
-#include "torus.cuh"
-#include <thread>
-#include <vector>
-
-template <typename Torus>
-__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
-                                uint32_t lwe_dimension_out,
-                                uint32_t level_count) {
-  int pos = i * level_count * (lwe_dimension_out + 1) +
-            level * (lwe_dimension_out + 1);
-  Torus *ptr = &ksk[pos];
-  return ptr;
-}
-
-/*
- * keyswitch kernel
- * Each thread handles a piece of the following equation:
- * $$GLWE_s2(\Delta.m+e) = (0,0,..,0,b) - \sum_{i=0,k-1} <Dec(a_i),
- * (GLWE_s2(s1_i q/beta),..,GLWE(s1_i q/beta^l)>$$ where k is the dimension of
- * the GLWE ciphertext. If the polynomial dimension in GLWE is > 1, this
- * equation is solved for each polynomial coefficient. where Dec denotes the
- * decomposition with base beta and l levels and the inner product is done
- * between the decomposition of a_i and l GLWE encryptions of s1_i q/\beta^j,
- * with j in [1,l] We obtain a GLWE encryption of Delta.m (with Delta the
- * scaling factor) under key s2 instead of s1, with an increased noise
- *
- */
-template <typename Torus>
-__global__ void
-keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
-          Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in,
-          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-          int lwe_lower, int lwe_upper, int cutoff) {
-  int tid = threadIdx.x;
-
-  extern __shared__ int8_t sharedmem[];
-
-  Torus *local_lwe_array_out = (Torus *)sharedmem;
-
-  auto block_lwe_array_in = get_chunk(
-      lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
-  auto block_lwe_array_out = get_chunk(
-      lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);
-
-  auto gadget = GadgetMatrixSingle<Torus>(base_log, level_count);
-
-  int lwe_part_per_thd;
-  if (tid < cutoff) {
-    lwe_part_per_thd = lwe_upper;
-  } else {
-    lwe_part_per_thd = lwe_lower;
-  }
-  __syncthreads();
-
-  for (int k = 0; k < lwe_part_per_thd; k++) {
-    int idx = tid + k * blockDim.x;
-    local_lwe_array_out[idx] = 0;
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    local_lwe_array_out[lwe_dimension_out] =
-        block_lwe_array_in[lwe_dimension_in];
-  }
-
-  for (int i = 0; i < lwe_dimension_in; i++) {
-
-    __syncthreads();
-
-    Torus a_i =
-        round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);
-
-    Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
-    Torus mask_mod_b = (1ll << base_log) - 1ll;
-
-    for (int j = 0; j < level_count; j++) {
-      auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
-      Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
-      for (int k = 0; k < lwe_part_per_thd; k++) {
-        int idx = tid + k * blockDim.x;
-        local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
-      }
-    }
-  }
-
-  for (int k = 0; k < lwe_part_per_thd; k++) {
-    int idx = tid + k * blockDim.x;
-    block_lwe_array_out[idx] = local_lwe_array_out[idx];
-  }
-}
-
-/// assume lwe_array_in in the gpu
-template <typename Torus>
-__host__ void cuda_keyswitch_lwe_ciphertext_vector(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
-    Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
-    uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples) {
-
-  cudaSetDevice(stream->gpu_index);
-  constexpr int ideal_threads = 128;
-
-  int lwe_dim = lwe_dimension_out + 1;
-  int lwe_lower, lwe_upper, cutoff;
-  if (lwe_dim % ideal_threads == 0) {
-    lwe_lower = lwe_dim / ideal_threads;
-    lwe_upper = lwe_dim / ideal_threads;
-    cutoff = 0;
-  } else {
-    int y =
-        ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
-    cutoff = ideal_threads - y;
-    lwe_lower = lwe_dim / ideal_threads;
-    lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
-  }
-
-  int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
-
-  int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
-
-  cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
-  check_cuda_error(cudaGetLastError());
-
-  dim3 grid(num_samples, 1, 1);
-  dim3 threads(ideal_threads, 1, 1);
-
-  //    cudaFuncSetAttribute(keyswitch<Torus>,
-  //                         cudaFuncAttributeMaxDynamicSharedMemorySize,
-  //                         shared_mem);
-
-  keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
-      lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
-      lwe_upper, cutoff);
-  check_cuda_error(cudaGetLastError());
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -1,74 +0,0 @@
-#ifndef CNCRT_TORUS_CUH
-#define CNCRT_TORUS_CUH
-
-#include "types/int128.cuh"
-#include <limits>
-
-template <typename T>
-__device__ inline void typecast_double_to_torus(double x, T &r) {
-  r = T(x);
-}
-
-template <>
-__device__ inline void typecast_double_to_torus<uint32_t>(double x,
-                                                          uint32_t &r) {
-  r = __double2uint_rn(x);
-}
-
-template <>
-__device__ inline void typecast_double_to_torus<uint64_t>(double x,
-                                                          uint64_t &r) {
-  // The ull intrinsic does not behave in the same way on all architectures and
-  // on some platforms this causes the cmux tree test to fail
-  // Hence the intrinsic is not used here
-  uint128 nnnn = make_uint128_from_float(x);
-  uint64_t lll = nnnn.lo_;
-  r = lll;
-}
-
-template <typename T>
-__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
-                                              uint32_t level_count) {
-  T shift = sizeof(T) * 8 - level_count * base_log;
-  T mask = 1ll << (shift - 1);
-  T b = (x & mask) >> (shift - 1);
-  T res = x >> shift;
-  res += b;
-  res <<= shift;
-  return res;
-}
-
-template <typename T>
-__device__ __forceinline__ void rescale_torus_element(T element, T &output,
-                                                      uint32_t log_shift) {
-  output =
-      round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
-            (double)log_shift);
-}
-
-template <typename T>
-__device__ __forceinline__ T rescale_torus_element(T element,
-                                                   uint32_t log_shift) {
-  return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
-               (double)log_shift);
-}
-
-template <>
-__device__ __forceinline__ void
-rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
-                                uint32_t log_shift) {
-  output =
-      round(__uint2double_rn(element) /
-            (__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
-            __uint2double_rn(log_shift));
-}
-
-template <>
-__device__ __forceinline__ void
-rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
-                                uint32_t log_shift) {
-  output = round(__ull2double_rn(element) /
-                 (__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
-                 __uint2double_rn(log_shift));
-}
-#endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -1,254 +0,0 @@
-#include "device.h"
-#include <cstdint>
-#include <cuda_runtime.h>
-
-/// Unsafe function to create a CUDA stream, must check first that GPU exists
-cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-  cuda_stream_t *stream = new cuda_stream_t(gpu_index);
-  return stream;
-}
-
-/// Unsafe function to destroy CUDA stream, must check first the GPU exists
-void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); }
-
-/// Unsafe function that will try to allocate even if gpu_index is invalid
-/// or if there's not enough memory. A safe wrapper around it must call
-/// cuda_check_valid_malloc() first
-void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-  void *ptr;
-  check_cuda_error(cudaMalloc((void **)&ptr, size));
-
-  return ptr;
-}
-
-/// Allocates a size-byte array at the device memory. Tries to do it
-/// asynchronously.
-void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
-  void *ptr;
-
-#ifndef CUDART_VERSION
-#error CUDART_VERSION Undefined!
-#elif (CUDART_VERSION >= 11020)
-  int support_async_alloc;
-  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
-                                          cudaDevAttrMemoryPoolsSupported,
-                                          stream->gpu_index));
-
-  if (support_async_alloc) {
-    check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream->stream));
-  } else {
-    check_cuda_error(cudaMalloc((void **)&ptr, size));
-  }
-#else
-  check_cuda_error(cudaMalloc((void **)&ptr, size));
-#endif
-  return ptr;
-}
-
-/// Check that allocation is valid
-void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-  size_t total_mem, free_mem;
-  check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
-  if (size > free_mem) {
-    PANIC("Cuda error: not enough memory on device. "
-          "Available: %zu vs Requested: %lu",
-          free_mem, size);
-  }
-}
-
-/// Returns
-///  false if Cooperative Groups is not supported.
-///  true otherwise
-bool cuda_check_support_cooperative_groups() {
-  int cooperative_groups_supported = 0;
-  check_cuda_error(cudaDeviceGetAttribute(&cooperative_groups_supported,
-                                          cudaDevAttrCooperativeLaunch, 0));
-
-  return cooperative_groups_supported > 0;
-}
-
-/// Copy memory to the GPU asynchronously
-void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
-                              cuda_stream_t *stream) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid device pointer in async copy to GPU.");
-  }
-
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
-  check_cuda_error(
-      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
-}
-
-/// Copy memory to the GPU synchronously
-void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid device pointer in copy to GPU.");
-  }
-  check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
-}
-
-/// Copy memory to the CPU synchronously
-void cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, src));
-  if (attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid device pointer in copy to CPU.");
-  }
-  check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
-}
-
-/// Copy memory within a GPU asynchronously
-void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
-                                  cuda_stream_t *stream) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr_dest;
-  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.device != stream->gpu_index &&
-      attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.");
-  }
-  cudaPointerAttributes attr_src;
-  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.device != stream->gpu_index &&
-      attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.");
-  }
-  if (attr_src.device != attr_dest.device) {
-    PANIC("Cuda error: different devices specified in copy from GPU to GPU.");
-  }
-
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
-  check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
-                                   stream->stream));
-}
-
-/// Synchronizes device
-void cuda_synchronize_device(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-  check_cuda_error(cudaDeviceSynchronize());
-}
-
-void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
-                       cuda_stream_t *stream) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, dest));
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in cuda memset.");
-  }
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
-  check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
-}
-
-template <typename Torus>
-__global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
-  int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < n)
-    array[index] = value;
-}
-
-template <typename Torus>
-void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
-                          Torus n) {
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
-  if (attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in cuda set value.");
-  }
-  int block_size = 256;
-  int num_blocks = (n + block_size - 1) / block_size;
-
-  // Launch the kernel
-  cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
-                                                                n);
-  check_cuda_error(cudaGetLastError());
-}
-
-/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
-template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
-                                   uint64_t value, uint64_t n);
-template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
-                                   uint32_t value, uint32_t n);
-
-/// Copy memory to the CPU asynchronously
-void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
-                              cuda_stream_t *stream) {
-  if (size == 0)
-    return;
-  cudaPointerAttributes attr;
-  check_cuda_error(cudaPointerGetAttributes(&attr, src));
-  if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy to CPU async.");
-  }
-
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
-  check_cuda_error(
-      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
-}
-
-/// Return number of GPUs available
-int cuda_get_number_of_gpus() {
-  int num_gpus;
-  check_cuda_error(cudaGetDeviceCount(&num_gpus));
-  return num_gpus;
-}
-
-/// Drop a cuda array
-void cuda_drop(void *ptr, uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-  check_cuda_error(cudaFree(ptr));
-}
-
-/// Drop a cuda array asynchronously, if supported on the device
-void cuda_drop_async(void *ptr, cuda_stream_t *stream) {
-
-  check_cuda_error(cudaSetDevice(stream->gpu_index));
-#ifndef CUDART_VERSION
-#error CUDART_VERSION Undefined!
-#elif (CUDART_VERSION >= 11020)
-  int support_async_alloc;
-  check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
-                                          cudaDevAttrMemoryPoolsSupported,
-                                          stream->gpu_index));
-
-  if (support_async_alloc) {
-    check_cuda_error(cudaFreeAsync(ptr, stream->stream));
-  } else {
-    check_cuda_error(cudaFree(ptr));
-  }
-#else
-  check_cuda_error(cudaFree(ptr));
-#endif
-}
-
-/// Get the maximum size for the shared memory
-int cuda_get_max_shared_memory(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-  cudaDeviceProp prop;
-  check_cuda_error(cudaGetDeviceProperties(&prop, gpu_index));
-  int max_shared_memory = 0;
-  if (prop.major >= 6) {
-    max_shared_memory = prop.sharedMemPerMultiprocessor;
-  } else {
-    max_shared_memory = prop.sharedMemPerBlock;
-  }
-  return max_shared_memory;
-}
-
-void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }
--- a/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
@@ -1,725 +0,0 @@
-#ifndef GPU_BOOTSTRAP_FFT_CUH
-#define GPU_BOOTSTRAP_FFT_CUH
-
-#include "polynomial/functions.cuh"
-#include "polynomial/parameters.cuh"
-#include "twiddles.cuh"
-#include "types/complex/operations.cuh"
-
-/*
- * Direct negacyclic FFT:
- *   - before the FFT the N real coefficients are stored into a
- *     N/2 sized complex with the even coefficients in the real part
- *     and the odd coefficients in the imaginary part. This is referred to
- *     as the half-size FFT
- *   - when calling BNSMFFT_direct for the forward negacyclic FFT of PBS,
- *     opt is divided by 2 because the butterfly pattern is always applied
- *     between pairs of coefficients
- *   - instead of twisting each coefficient A_j before the FFT by
- *     multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
- *     the FFT is modified, and for each level k of the FFT the twiddle:
- *     w_j,k = exp(-i pi j/2^k)
- *     is replaced with:
- *     \zeta_j,k = exp(-i pi (2j-1)/2^k)
- */
-template <class params> __device__ void NSMFFT_direct(double2 *A) {
-
-  /* We don't make bit reverse here, since twiddles are already reversed
-   *  Each thread is always in charge of "opt/2" pairs of coefficients,
-   *  which is why we always loop through N/2 by N/opt strides
-   *  The pragma unroll instruction tells the compiler to unroll the
-   *  full loop, which should increase performance
-   */
-
-  size_t tid = threadIdx.x;
-  size_t twid_id;
-  size_t i1, i2;
-  double2 u, v, w;
-  // level 1
-  // we don't make actual complex multiplication on level1 since we have only
-  // one twiddle, it's real and image parts are equal, so we can multiply
-  // it with simpler operations
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    i1 = tid;
-    i2 = tid + params::degree / 2;
-
-    u = A[i1];
-    v = A[i2] * (double2){0.707106781186547461715008466854,
-                          0.707106781186547461715008466854};
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 2
-  // from this level there are more than one twiddles and none of them has equal
-  // real and imag parts, so complete complex multiplication is needed
-  // for each level params::degree / 2^level represents number of coefficients
-  // inside divided chunk of specific level
-  //
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 4);
-    i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
-    i2 = i1 + params::degree / 4;
-
-    w = negtwiddles[twid_id + 2];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 3
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 8);
-    i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
-    i2 = i1 + params::degree / 8;
-
-    w = negtwiddles[twid_id + 4];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 4
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 16);
-    i1 =
-        2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
-    i2 = i1 + params::degree / 16;
-
-    w = negtwiddles[twid_id + 8];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 5
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 32);
-    i1 =
-        2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
-    i2 = i1 + params::degree / 32;
-
-    w = negtwiddles[twid_id + 16];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 6
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 64);
-    i1 =
-        2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
-    i2 = i1 + params::degree / 64;
-
-    w = negtwiddles[twid_id + 32];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 7
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 128);
-    i1 = 2 * (params::degree / 128) * twid_id +
-         (tid & (params::degree / 128 - 1));
-    i2 = i1 + params::degree / 128;
-
-    w = negtwiddles[twid_id + 64];
-    u = A[i1];
-    v = A[i2] * w;
-
-    A[i1] += v;
-    A[i2] = u - v;
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // from level 8, we need to check size of params degree, because we support
-  // minimum actual polynomial size = 256,  when compressed size is halfed and
-  // minimum supported compressed size is 128, so we always need first 7
-  // levels of butterfy operation, since butterfly levels are hardcoded
-  // we need to check if polynomial size is big enough to require specific level
-  // of butterfly.
-  if constexpr (params::degree >= 256) {
-    // level 8
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 256);
-      i1 = 2 * (params::degree / 256) * twid_id +
-           (tid & (params::degree / 256 - 1));
-      i2 = i1 + params::degree / 256;
-
-      w = negtwiddles[twid_id + 128];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 512) {
-    // level 9
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 512);
-      i1 = 2 * (params::degree / 512) * twid_id +
-           (tid & (params::degree / 512 - 1));
-      i2 = i1 + params::degree / 512;
-
-      w = negtwiddles[twid_id + 256];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 1024) {
-    // level 10
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 1024);
-      i1 = 2 * (params::degree / 1024) * twid_id +
-           (tid & (params::degree / 1024 - 1));
-      i2 = i1 + params::degree / 1024;
-
-      w = negtwiddles[twid_id + 512];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 2048) {
-    // level 11
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 2048);
-      i1 = 2 * (params::degree / 2048) * twid_id +
-           (tid & (params::degree / 2048 - 1));
-      i2 = i1 + params::degree / 2048;
-
-      w = negtwiddles[twid_id + 1024];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 4096) {
-    // level 12
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 4096);
-      i1 = 2 * (params::degree / 4096) * twid_id +
-           (tid & (params::degree / 4096 - 1));
-      i2 = i1 + params::degree / 4096;
-
-      w = negtwiddles[twid_id + 2048];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  // compressed size = 8192 is actual polynomial size = 16384.
-  // from this size, twiddles can't fit in constant memory,
-  // so from here, butterfly operation access device memory.
-  if constexpr (params::degree >= 8192) {
-    // level 13
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 8192);
-      i1 = 2 * (params::degree / 8192) * twid_id +
-           (tid & (params::degree / 8192 - 1));
-      i2 = i1 + params::degree / 8192;
-
-      w = negtwiddles13[twid_id];
-      u = A[i1];
-      v = A[i2] * w;
-
-      A[i1] += v;
-      A[i2] = u - v;
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-}
-
-/*
- * negacyclic inverse fft
- */
-template <class params> __device__ void NSMFFT_inverse(double2 *A) {
-
-  /* We don't make bit reverse here, since twiddles are already reversed
-   *  Each thread is always in charge of "opt/2" pairs of coefficients,
-   *  which is why we always loop through N/2 by N/opt strides
-   *  The pragma unroll instruction tells the compiler to unroll the
-   *  full loop, which should increase performance
-   */
-
-  size_t tid = threadIdx.x;
-  size_t twid_id;
-  size_t i1, i2;
-  double2 u, w;
-
-  // divide input by compressed polynomial size
-  tid = threadIdx.x;
-  for (size_t i = 0; i < params::opt; ++i) {
-    A[tid] /= params::degree;
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // none of the twiddles have equal real and imag part, so
-  // complete complex multiplication has to be done
-  // here we have more than one twiddle
-  // mapping in backward fft is reversed
-  // butterfly operation is started from last level
-
-  // compressed size = 8192 is actual polynomial size = 16384.
-  // twiddles for this size can't fit in constant memory so
-  // butterfly operation for this level acess device memory to fetch
-  // twiddles
-  if constexpr (params::degree >= 8192) {
-    // level 13
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 8192);
-      i1 = 2 * (params::degree / 8192) * twid_id +
-           (tid & (params::degree / 8192 - 1));
-      i2 = i1 + params::degree / 8192;
-
-      w = negtwiddles13[twid_id];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 4096) {
-    // level 12
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 4096);
-      i1 = 2 * (params::degree / 4096) * twid_id +
-           (tid & (params::degree / 4096 - 1));
-      i2 = i1 + params::degree / 4096;
-
-      w = negtwiddles[twid_id + 2048];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 2048) {
-    // level 11
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 2048);
-      i1 = 2 * (params::degree / 2048) * twid_id +
-           (tid & (params::degree / 2048 - 1));
-      i2 = i1 + params::degree / 2048;
-
-      w = negtwiddles[twid_id + 1024];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 1024) {
-    // level 10
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 1024);
-      i1 = 2 * (params::degree / 1024) * twid_id +
-           (tid & (params::degree / 1024 - 1));
-      i2 = i1 + params::degree / 1024;
-
-      w = negtwiddles[twid_id + 512];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 512) {
-    // level 9
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 512);
-      i1 = 2 * (params::degree / 512) * twid_id +
-           (tid & (params::degree / 512 - 1));
-      i2 = i1 + params::degree / 512;
-
-      w = negtwiddles[twid_id + 256];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  if constexpr (params::degree >= 256) {
-    // level 8
-    tid = threadIdx.x;
-#pragma unroll
-    for (size_t i = 0; i < params::opt / 2; ++i) {
-      twid_id = tid / (params::degree / 256);
-      i1 = 2 * (params::degree / 256) * twid_id +
-           (tid & (params::degree / 256 - 1));
-      i2 = i1 + params::degree / 256;
-
-      w = negtwiddles[twid_id + 128];
-      u = A[i1] - A[i2];
-
-      A[i1] += A[i2];
-      A[i2] = u * conjugate(w);
-
-      tid += params::degree / params::opt;
-    }
-    __syncthreads();
-  }
-
-  // below level 8, we don't need to check size of params degree, because we
-  // support minimum actual polynomial size = 256,  when compressed size is
-  // halfed and minimum supported compressed size is 128, so we always need
-  // last 7 levels of butterfy operation, since butterfly levels are hardcoded
-  // we don't need to check if polynomial size is big enough to require
-  // specific level of butterfly.
-  // level 7
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 128);
-    i1 = 2 * (params::degree / 128) * twid_id +
-         (tid & (params::degree / 128 - 1));
-    i2 = i1 + params::degree / 128;
-
-    w = negtwiddles[twid_id + 64];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 6
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 64);
-    i1 =
-        2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
-    i2 = i1 + params::degree / 64;
-
-    w = negtwiddles[twid_id + 32];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 5
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 32);
-    i1 =
-        2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
-    i2 = i1 + params::degree / 32;
-
-    w = negtwiddles[twid_id + 16];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 4
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 16);
-    i1 =
-        2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
-    i2 = i1 + params::degree / 16;
-
-    w = negtwiddles[twid_id + 8];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 3
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 8);
-    i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
-    i2 = i1 + params::degree / 8;
-
-    w = negtwiddles[twid_id + 4];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 2
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 4);
-    i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
-    i2 = i1 + params::degree / 4;
-
-    w = negtwiddles[twid_id + 2];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // level 1
-  tid = threadIdx.x;
-#pragma unroll
-  for (size_t i = 0; i < params::opt / 2; ++i) {
-    twid_id = tid / (params::degree / 2);
-    i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
-    i2 = i1 + params::degree / 2;
-
-    w = negtwiddles[twid_id + 1];
-    u = A[i1] - A[i2];
-
-    A[i1] += A[i2];
-    A[i2] = u * conjugate(w);
-
-    tid += params::degree / params::opt;
-  }
-  __syncthreads();
-}
-
-/*
- * global batch fft
- * does fft in half size
- * unrolling half size fft result in half size + 1 elements
- * this function must be called with actual degree
- * function takes as input already compressed input
- */
-template <class params, sharedMemDegree SMD>
-__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
-                             double2 *buffer) {
-  extern __shared__ double2 sharedMemoryFFT[];
-  double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
-                               : sharedMemoryFFT;
-  int tid = threadIdx.x;
-
-#pragma unroll
-  for (int i = 0; i < params::opt / 2; i++) {
-    fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
-    tid = tid + params::degree / params::opt;
-  }
-  __syncthreads();
-  NSMFFT_direct<HalfDegree<params>>(fft);
-  __syncthreads();
-
-  tid = threadIdx.x;
-#pragma unroll
-  for (int i = 0; i < params::opt / 2; i++) {
-    d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
-    tid = tid + params::degree / params::opt;
-  }
-}
-
-/*
- * global batch polynomial multiplication
- * only used for fft tests
- * d_input1 and d_output must not have the same pointer
- * d_input1 can be modified inside the function
- */
-template <class params, sharedMemDegree SMD>
-__global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
-                                     double2 *d_output, double2 *buffer) {
-  extern __shared__ double2 sharedMemoryFFT[];
-  double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
-                               : sharedMemoryFFT;
-
-  // Move first polynomial into shared memory(if possible otherwise it will
-  // be moved in device buffer)
-  int tid = threadIdx.x;
-#pragma unroll
-  for (int i = 0; i < params::opt / 2; i++) {
-    fft[tid] = d_input1[blockIdx.x * (params::degree / 2) + tid];
-    tid = tid + params::degree / params::opt;
-  }
-
-  // Perform direct negacyclic fourier transform
-  __syncthreads();
-  NSMFFT_direct<HalfDegree<params>>(fft);
-  __syncthreads();
-
-  // Put the result of direct fft inside input1
-  tid = threadIdx.x;
-#pragma unroll
-  for (int i = 0; i < params::opt / 2; i++) {
-    d_input1[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
-    tid = tid + params::degree / params::opt;
-  }
-  __syncthreads();
-
-  // Move first polynomial into shared memory(if possible otherwise it will
-  // be moved in device buffer)
-  tid = threadIdx.x;
-#pragma unroll
-  for (int i = 0; i < params::opt / 2; i++) {
-    fft[tid] = d_input2[blockIdx.x * (params::degree / 2) + tid];
-    tid = tid + params::degree / params::opt;
-  }
-
-  // Perform direct negacyclic fourier transform on the second polynomial
-  __syncthreads();
-  NSMFFT_direct<HalfDegree<params>>(fft);
-  __syncthreads();
-
-  // calculate pointwise multiplication inside fft buffer
-  tid = threadIdx.x;
-#pragma unroll
-  for (int i = 0; i < params::opt / 2; i++) {
-    fft[tid] *= d_input1[blockIdx.x * (params::degree / 2) + tid];
-    tid = tid + params::degree / params::opt;
-  }
-
-  // Perform backward negacyclic fourier transform
-  __syncthreads();
-  NSMFFT_inverse<HalfDegree<params>>(fft);
-  __syncthreads();
-
-  // copy results in output buffer
-  tid = threadIdx.x;
-#pragma unroll
-  for (int i = 0; i < params::opt / 2; i++) {
-    d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
-    tid = tid + params::degree / params::opt;
-  }
-}
-
-#endif // GPU_BOOTSTRAP_FFT_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu
--- a/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh
@@ -1,13 +0,0 @@
-#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
-#define GPU_BOOTSTRAP_TWIDDLES_CUH
-
-/*
- * 'negtwiddles' are stored in constant memory for faster access times
- * because of it's limitied size, only twiddles for up to 2^12 polynomial size
- * can be stored there, twiddles for 2^13 are stored in device memory
- * 'negtwiddles13'
- */
-
-extern __constant__ double2 negtwiddles[4096];
-extern __device__ double2 negtwiddles13[4096];
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -1,51 +0,0 @@
-#include "integer/bitwise_ops.cuh"
-
-void scratch_cuda_integer_radix_bitop_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
-    bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_integer_radix_bitop_kb<uint64_t>(
-      stream, (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count,
-      params, op_type, allocate_gpu_memory);
-}
-
-void cuda_bitop_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
-    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t lwe_ciphertext_count) {
-
-  host_integer_radix_bitop_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
-      static_cast<uint64_t *>(lwe_array_1),
-      static_cast<uint64_t *>(lwe_array_2),
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
-      lwe_ciphertext_count);
-}
-
-void cuda_bitnot_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
-    int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
-
-  host_integer_radix_bitnot_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
-      static_cast<uint64_t *>(lwe_array_in),
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
-      lwe_ciphertext_count);
-}
-
-void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {
-
-  int_bitop_buffer<uint64_t> *mem_ptr =
-      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -1,51 +0,0 @@
-#ifndef CUDA_INTEGER_BITWISE_OPS_CUH
-#define CUDA_INTEGER_BITWISE_OPS_CUH
-
-#include "crypto/keyswitch.cuh"
-#include "device.h"
-#include "integer.cuh"
-#include "integer.h"
-#include "pbs/bootstrap_low_latency.cuh"
-#include "pbs/bootstrap_multibit.cuh"
-#include "polynomial/functions.cuh"
-#include "utils/kernel_dimensions.cuh"
-#include <omp.h>
-
-template <typename Torus>
-__host__ void
-host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                            Torus *lwe_array_1, Torus *lwe_array_2,
-                            int_bitop_buffer<Torus> *mem_ptr, void *bsk,
-                            Torus *ksk, uint32_t num_radix_blocks) {
-
-  auto lut = mem_ptr->lut;
-
-  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
-      num_radix_blocks, lut);
-}
-
-template <typename Torus>
-__host__ void
-host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                             Torus *lwe_array_in,
-                             int_bitop_buffer<Torus> *mem_ptr, void *bsk,
-                             Torus *ksk, uint32_t num_radix_blocks) {
-
-  auto lut = mem_ptr->lut;
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks, lut);
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_integer_radix_bitop_kb(
-    cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
-    bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
-                                         allocate_gpu_memory);
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -1,45 +0,0 @@
-#include "integer/cmux.cuh"
-
-void scratch_cuda_integer_radix_cmux_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  std::function<uint64_t(uint64_t)> predicate_lut_f =
-      [](uint64_t x) -> uint64_t { return x == 1; };
-
-  scratch_cuda_integer_radix_cmux_kb(
-      stream, (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
-      lwe_ciphertext_count, params, allocate_gpu_memory);
-}
-
-void cuda_cmux_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
-    void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t lwe_ciphertext_count) {
-
-  host_integer_radix_cmux_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
-      static_cast<uint64_t *>(lwe_condition),
-      static_cast<uint64_t *>(lwe_array_true),
-      static_cast<uint64_t *>(lwe_array_false),
-      (int_cmux_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
-
-      lwe_ciphertext_count);
-}
-
-void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
-                                     int8_t **mem_ptr_void) {
-
-  int_cmux_buffer<uint64_t> *mem_ptr =
-      (int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -1,100 +0,0 @@
-#ifndef CUDA_INTEGER_CMUX_CUH
-#define CUDA_INTEGER_CMUX_CUH
-
-#include "integer.cuh"
-#include <omp.h>
-
-template <typename Torus>
-__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
-                          Torus *lwe_array_input, Torus *lwe_condition,
-                          int_zero_out_if_buffer<Torus> *mem_ptr,
-                          int_radix_lut<Torus> *predicate, void *bsk,
-                          Torus *ksk, uint32_t num_radix_blocks) {
-  auto params = mem_ptr->params;
-
-  int big_lwe_size = params.big_lwe_dimension + 1;
-
-  // Left message is shifted
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = (params.big_lwe_dimension + 1);
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-
-  // We can't use integer_radix_apply_bivariate_lookup_table_kb since the
-  // second operand is fixed
-  auto tmp_lwe_array_input = mem_ptr->tmp;
-  for (int i = 0; i < num_radix_blocks; i++) {
-    auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
-    auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
-
-    device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
-                                   stream->stream>>>(
-        lwe_array_out_block, lwe_array_input_block, lwe_condition,
-        predicate->lwe_indexes, params.big_lwe_dimension,
-        params.message_modulus, 1);
-    check_cuda_error(cudaGetLastError());
-  }
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
-      predicate);
-}
-
-template <typename Torus>
-__host__ void
-host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                           Torus *lwe_condition, Torus *lwe_array_true,
-                           Torus *lwe_array_false,
-                           int_cmux_buffer<Torus> *mem_ptr, void *bsk,
-                           Torus *ksk, uint32_t num_radix_blocks) {
-
-  auto params = mem_ptr->params;
-
-  // Since our CPU threads will be working on different streams we shall assert
-  // the work in the main stream is completed
-  stream->synchronize();
-  auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
-  auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;
-
-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      auto mem_true = mem_ptr->zero_if_true_buffer;
-      zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
-                  lwe_condition, mem_true, mem_ptr->inverted_predicate_lut, bsk,
-                  ksk, num_radix_blocks);
-    }
-#pragma omp section
-    {
-      auto mem_false = mem_ptr->zero_if_false_buffer;
-      zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
-                  lwe_condition, mem_false, mem_ptr->predicate_lut, bsk, ksk,
-                  num_radix_blocks);
-    }
-  }
-  cuda_synchronize_stream(true_stream);
-  cuda_synchronize_stream(false_stream);
-
-  // If the condition was true, true_ct will have kept its value and false_ct
-  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
-  // have kept its value
-  auto added_cts = mem_ptr->tmp_true_ct;
-  host_addition(stream, added_cts, mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
-                params.big_lwe_dimension, num_radix_blocks);
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array_out, added_cts, bsk, ksk, num_radix_blocks,
-      mem_ptr->message_extract_lut);
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_integer_radix_cmux_kb(
-    cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
-    std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
-    int_radix_params params, bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
-                                        num_radix_blocks, allocate_gpu_memory);
-}
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -1,83 +0,0 @@
-#include "integer/comparison.cuh"
-
-void scratch_cuda_integer_radix_comparison_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
-    bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  switch (op_type) {
-  case EQ:
-  case NE:
-    scratch_cuda_integer_radix_equality_check_kb<uint64_t>(
-        stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
-        lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
-    break;
-  case GT:
-  case GE:
-  case LT:
-  case LE:
-  case MAX:
-  case MIN:
-    scratch_cuda_integer_radix_difference_check_kb<uint64_t>(
-        stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
-        lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
-    break;
-  }
-}
-
-void cuda_comparison_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
-    void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t lwe_ciphertext_count) {
-
-  int_comparison_buffer<uint64_t> *buffer =
-      (int_comparison_buffer<uint64_t> *)mem_ptr;
-  switch (buffer->op) {
-  case EQ:
-  case NE:
-    host_integer_radix_equality_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_array_1),
-        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
-    break;
-  case GT:
-  case GE:
-  case LT:
-  case LE:
-    host_integer_radix_difference_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_array_1),
-        static_cast<uint64_t *>(lwe_array_2), buffer,
-        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
-        lwe_ciphertext_count);
-    break;
-  case MAX:
-  case MIN:
-    host_integer_radix_maxmin_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_array_1),
-        static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
-    break;
-  default:
-    PANIC("Cuda error: integer operation not supported");
-  }
-}
-
-void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
-                                     int8_t **mem_ptr_void) {
-
-  int_comparison_buffer<uint64_t> *mem_ptr =
-      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -1,527 +0,0 @@
-#ifndef CUDA_INTEGER_COMPARISON_OPS_CUH
-#define CUDA_INTEGER_COMPARISON_OPS_CUH
-
-#include "crypto/keyswitch.cuh"
-#include "device.h"
-#include "integer.cuh"
-#include "integer.h"
-#include "integer/cmux.cuh"
-#include "integer/negation.cuh"
-#include "integer/scalar_addition.cuh"
-#include "pbs/bootstrap_low_latency.cuh"
-#include "pbs/bootstrap_multibit.cuh"
-#include "types/complex/operations.cuh"
-#include "utils/kernel_dimensions.cuh"
-
-// lwe_dimension + 1 threads
-// todo: This kernel MUST be refactored to a binary reduction
-template <typename Torus>
-__global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
-                                             uint32_t lwe_dimension,
-                                             uint32_t num_blocks) {
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < lwe_dimension + 1) {
-    auto block = &input_block[idx];
-
-    Torus sum = block[0];
-    for (int i = 1; i < num_blocks; i++) {
-      sum += block[i * (lwe_dimension + 1)];
-    }
-
-    output[idx] = sum;
-  }
-}
-
-template <typename Torus>
-__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
-                                    Torus *input, uint32_t lwe_dimension,
-                                    uint32_t num_radix_blocks) {
-
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = (lwe_dimension + 1);
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  // Add all blocks and store in sum
-  device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
-      output, input, lwe_dimension, num_radix_blocks);
-  check_cuda_error(cudaGetLastError());
-}
-
-/* This takes an array of lwe ciphertexts, where each is an encryption of
- * either 0 or 1.
- *
- * It writes in lwe_array_out a single lwe ciphertext encrypting 1 if all input
- * blocks are 1 otherwise the block encrypts 0
- *
- */
-template <typename Torus>
-__host__ void
-are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
-                               Torus *lwe_array_in,
-                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                               Torus *ksk, uint32_t num_radix_blocks) {
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  auto are_all_block_true_buffer =
-      mem_ptr->eq_buffer->are_all_block_true_buffer;
-
-  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t max_value = total_modulus - 1;
-
-  cuda_memcpy_async_gpu_to_gpu(
-      lwe_array_out, lwe_array_in,
-      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
-
-  int lut_num_blocks = 0;
-  uint32_t remaining_blocks = num_radix_blocks;
-  while (remaining_blocks > 1) {
-    // Split in max_value chunks
-    uint32_t chunk_length = std::min(max_value, remaining_blocks);
-    int num_chunks = remaining_blocks / chunk_length;
-
-    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
-    // as in the worst case we will be adding `max_value` ones
-    auto input_blocks = lwe_array_out;
-    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
-    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(stream, accumulator, input_blocks,
-                            big_lwe_dimension, chunk_length);
-
-      accumulator += (big_lwe_dimension + 1);
-      remaining_blocks -= (chunk_length - 1);
-      input_blocks += (big_lwe_dimension + 1) * chunk_length;
-    }
-    accumulator = are_all_block_true_buffer->tmp_block_accumulated;
-
-    // Selects a LUT
-    int_radix_lut<Torus> *lut;
-    if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
-      // is_non_zero_lut_buffer LUT
-      lut = mem_ptr->eq_buffer->is_non_zero_lut;
-    } else if (chunk_length == max_value) {
-      // is_max_value LUT
-      lut = are_all_block_true_buffer->is_max_value_lut;
-    } else {
-      // is_equal_to_num_blocks LUT
-      lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
-      if (chunk_length != lut_num_blocks) {
-        auto is_equal_to_num_blocks_lut_f = [max_value,
-                                             chunk_length](Torus x) -> Torus {
-          return (x & max_value) == chunk_length;
-        };
-        generate_device_accumulator<Torus>(
-            stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
-            carry_modulus, is_equal_to_num_blocks_lut_f);
-
-        // We don't have to generate this lut again
-        lut_num_blocks = chunk_length;
-      }
-    }
-
-    // Applies the LUT
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
-  }
-}
-
-/* This takes an array of lwe ciphertexts, where each is an encryption of
- * either 0 or 1.
- *
- * It writes in lwe_array_out a single lwe ciphertext encrypting 1 if at least
- * one input ciphertext encrypts 1 otherwise encrypts 0
- */
-template <typename Torus>
-__host__ void is_at_least_one_comparisons_block_true(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
-    uint32_t num_radix_blocks) {
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;
-
-  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t max_value = total_modulus - 1;
-
-  cuda_memcpy_async_gpu_to_gpu(
-      lwe_array_out, lwe_array_in,
-      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
-
-  uint32_t remaining_blocks = num_radix_blocks;
-  while (remaining_blocks > 1) {
-    // Split in max_value chunks
-    uint32_t chunk_length = std::min(max_value, remaining_blocks);
-    int num_chunks = remaining_blocks / chunk_length;
-
-    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
-    // as in the worst case we will be adding `max_value` ones
-    auto input_blocks = lwe_array_out;
-    auto accumulator = buffer->tmp_block_accumulated;
-    for (int i = 0; i < num_chunks; i++) {
-      accumulate_all_blocks(stream, accumulator, input_blocks,
-                            big_lwe_dimension, chunk_length);
-
-      accumulator += (big_lwe_dimension + 1);
-      remaining_blocks -= (chunk_length - 1);
-      input_blocks += (big_lwe_dimension + 1) * chunk_length;
-    }
-    accumulator = buffer->tmp_block_accumulated;
-
-    // Selects a LUT
-    int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;
-
-    // Applies the LUT
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
-  }
-}
-
-// This takes an input slice of blocks.
-//
-// Each block can encrypt any value as long as its < message_modulus.
-//
-// It will compare blocks with 0, for either equality or difference.
-//
-// This returns a Vec of block, where each block encrypts 1 or 0
-// depending of if all blocks matched with the comparison type with 0.
-//
-// E.g. For ZeroComparisonType::Equality, if all input blocks are zero
-// than all returned block will encrypt 1
-//
-// The returned Vec will have less block than the number of input blocks.
-// The returned blocks potentially needs to be 'reduced' to one block
-// with eg are_all_comparisons_block_true.
-//
-// This function exists because sometimes it is faster to concatenate
-// multiple vec of 'boolean' shortint block before reducing them with
-// are_all_comparisons_block_true
-template <typename Torus>
-__host__ void host_compare_with_zero_equality(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
-    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  // The idea is that we will sum chunks of blocks until carries are full
-  // then we compare the sum with 0.
-  //
-  // If all blocks were 0, the sum will be zero
-  // If at least one bock was not zero, the sum won't be zero
-  uint32_t total_modulus = message_modulus * carry_modulus;
-  uint32_t message_max = message_modulus - 1;
-
-  uint32_t num_elements_to_fill_carry = (total_modulus - 1) / message_max;
-
-  size_t big_lwe_size = big_lwe_dimension + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  int num_sum_blocks = 0;
-  // Accumulator
-  auto sum = lwe_array_out;
-
-  if (num_radix_blocks == 1) {
-    // Just copy
-    cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
-    num_sum_blocks = 1;
-  } else {
-    uint32_t remainder_blocks = num_radix_blocks;
-    auto sum_i = sum;
-    auto chunk = lwe_array_in;
-    while (remainder_blocks > 1) {
-      uint32_t chunk_size =
-          std::min(remainder_blocks, num_elements_to_fill_carry);
-
-      accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
-                            chunk_size);
-
-      num_sum_blocks++;
-      remainder_blocks -= (chunk_size - 1);
-
-      // Update operands
-      chunk += chunk_size * big_lwe_size;
-      sum_i += big_lwe_size;
-    }
-  }
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, sum, sum, bsk, ksk, num_sum_blocks, zero_comparison);
-  are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
-                                 num_sum_blocks);
-
-  // The result will be in the two first block. Everything else is
-  //  garbage.
-  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                    big_lwe_size_bytes * (num_radix_blocks - 1), stream);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_equality_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
-    Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks) {
-
-  auto eq_buffer = mem_ptr->eq_buffer;
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-
-  // Applies the LUT for the comparison operation
-  auto comparisons = mem_ptr->tmp_block_comparisons;
-  integer_radix_apply_bivariate_lookup_table_kb(
-      stream, comparisons, lwe_array_1, lwe_array_2, bsk, ksk, num_radix_blocks,
-      eq_buffer->operator_lut);
-
-  // This takes a Vec of blocks, where each block is either 0 or 1.
-  //
-  // It return a block encrypting 1 if all input blocks are 1
-  // otherwise the block encrypts 0
-  are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
-                                 bsk, ksk, num_radix_blocks);
-
-  // Zero all blocks but the first
-  size_t big_lwe_size = big_lwe_dimension + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                    big_lwe_size_bytes * (num_radix_blocks - 1), stream);
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_integer_radix_equality_check_kb(
-    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
-    bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_comparison_buffer<Torus>(
-      stream, op, params, num_radix_blocks, allocate_gpu_memory);
-}
-
-template <typename Torus>
-__host__ void
-compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                        Torus *lwe_array_left, Torus *lwe_array_right,
-                        int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                        Torus *ksk, uint32_t num_radix_blocks) {
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  // When rhs > lhs, the subtraction will overflow, and the bit of padding will
-  // be set to 1
-  // meaning that the output of the pbs will be the negative (modulo message
-  // space)
-  //
-  // Example:
-  // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
-  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
-  // Since there was an overflow the bit of padding is 1 and not 0.
-  // When applying the LUT for an input value of 14 we would expect 1,
-  // but since the bit of padding is 1, we will get -1 modulus our message
-  // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
-
-  // Subtract
-  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
-                   big_lwe_dimension, num_radix_blocks);
-
-  // Apply LUT to compare to 0
-  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb(
-      stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
-      is_non_zero_lut);
-
-  // Add one
-  // Here Lhs can have the following values: (-1) % (message modulus * carry
-  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
-                                            big_lwe_dimension, num_radix_blocks,
-                                            message_modulus, carry_modulus);
-}
-
-// Reduces a vec containing shortint blocks that encrypts a sign
-// (inferior, equal, superior) to one single shortint block containing the
-// final sign
-template <typename Torus>
-__host__ void
-tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
-                    Torus *lwe_block_comparisons,
-                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
-                    std::function<Torus(Torus)> sign_handler_f, void *bsk,
-                    Torus *ksk, uint32_t num_radix_blocks) {
-
-  auto params = tree_buffer->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  // Tree reduction
-  // Reduces a vec containing shortint blocks that encrypts a sign
-  // (inferior, equal, superior) to one single shortint block containing the
-  // final sign
-  size_t big_lwe_size = big_lwe_dimension + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  auto x = tree_buffer->tmp_x;
-  auto y = tree_buffer->tmp_y;
-  if (x != lwe_block_comparisons)
-    cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
-                                 big_lwe_size_bytes * num_radix_blocks, stream);
-
-  uint32_t partial_block_count = num_radix_blocks;
-
-  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
-  while (partial_block_count > 2) {
-    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
-
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, x, y, bsk, ksk, partial_block_count >> 1, inner_tree_leaf);
-
-    if ((partial_block_count % 2) != 0) {
-      partial_block_count >>= 1;
-      partial_block_count++;
-
-      auto last_y_block = y + (partial_block_count - 1) * big_lwe_size;
-      auto last_x_block = x + (partial_block_count - 1) * big_lwe_size;
-
-      cuda_memcpy_async_gpu_to_gpu(last_x_block, last_y_block,
-                                   big_lwe_size_bytes, stream);
-    } else {
-      partial_block_count >>= 1;
-    }
-  }
-
-  auto last_lut = tree_buffer->tree_last_leaf_lut;
-  auto block_selector_f = tree_buffer->block_selector_f;
-  std::function<Torus(Torus)> f;
-
-  if (partial_block_count == 2) {
-    pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
-
-    f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
-      int msb = (x >> 2) & 3;
-      int lsb = x & 3;
-
-      int final_sign = block_selector_f(msb, lsb);
-      return sign_handler_f(final_sign);
-    };
-  } else {
-    // partial_block_count == 1
-    y = x;
-    f = sign_handler_f;
-  }
-  generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
-                                     polynomial_size, message_modulus,
-                                     carry_modulus, f);
-
-  // Last leaf
-  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
-                                                 ksk, 1, last_lut);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_difference_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
-    Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
-    uint32_t total_num_radix_blocks) {
-
-  auto diff_buffer = mem_ptr->diff_buffer;
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  uint32_t num_radix_blocks = total_num_radix_blocks;
-  auto lhs = lwe_array_left;
-  auto rhs = lwe_array_right;
-  if (carry_modulus == message_modulus) {
-    // Packing is possible
-    // Pack inputs
-    Torus *packed_left = diff_buffer->tmp_packed_left;
-    Torus *packed_right = diff_buffer->tmp_packed_right;
-    pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
-                num_radix_blocks, message_modulus);
-    pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
-                num_radix_blocks, message_modulus);
-    // From this point we have half number of blocks
-    num_radix_blocks /= 2;
-
-    // Clean noise
-    auto cleaning_lut = mem_ptr->cleaning_lut;
-    integer_radix_apply_univariate_lookup_table_kb(
-        stream, packed_left, packed_left, bsk, ksk, num_radix_blocks,
-        cleaning_lut);
-    integer_radix_apply_univariate_lookup_table_kb(
-        stream, packed_right, packed_right, bsk, ksk, num_radix_blocks,
-        cleaning_lut);
-
-    lhs = packed_left;
-    rhs = packed_right;
-  }
-
-  // comparisons will be assigned
-  // - 0 if lhs < rhs
-  // - 1 if lhs == rhs
-  // - 2 if lhs > rhs
-  auto comparisons = mem_ptr->tmp_block_comparisons;
-  compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
-                          num_radix_blocks);
-
-  // Reduces a vec containing radix blocks that encrypts a sign
-  // (inferior, equal, superior) to one single radix block containing the
-  // final sign
-  tree_sign_reduction(stream, lwe_array_out, comparisons,
-                      mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
-                      ksk, num_radix_blocks);
-
-  // The result will be in the first block. Everything else is garbage.
-  size_t big_lwe_size = big_lwe_dimension + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-  cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                    (total_num_radix_blocks - 1) * big_lwe_size_bytes, stream);
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_integer_radix_difference_check_kb(
-    cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
-    bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_comparison_buffer<Torus>(
-      stream, op, params, num_radix_blocks, allocate_gpu_memory);
-}
-
-template <typename Torus>
-__host__ void
-host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                             Torus *lwe_array_left, Torus *lwe_array_right,
-                             int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                             Torus *ksk, uint32_t total_num_radix_blocks) {
-
-  // Compute the sign
-  host_integer_radix_difference_check_kb(
-      stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
-      mem_ptr, mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks);
-
-  // Selector
-  host_integer_radix_cmux_kb(
-      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -1,127 +0,0 @@
-#include "integer/integer.cuh"
-#include <linear_algebra.h>
-
-void cuda_full_propagation_64_inplace(
-    cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
-    void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
-    uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
-    uint32_t num_blocks) {
-
-  switch (polynomial_size) {
-  case 256:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 512:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 1024:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 2048:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 4096:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 8192:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  case 16384:
-    host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
-        stream, static_cast<uint64_t *>(input_blocks),
-        (int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
-        bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
-        ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
-    break;
-  default:
-    break;
-  }
-}
-
-void scratch_cuda_full_propagation_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
-
-  scratch_cuda_full_propagation<uint64_t>(
-      stream, (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension,
-      glwe_dimension, polynomial_size, level_count, grouping_factor,
-      input_lwe_ciphertext_count, message_modulus, carry_modulus, pbs_type,
-      allocate_gpu_memory);
-}
-
-void cleanup_cuda_full_propagation(cuda_stream_t *stream,
-                                   int8_t **mem_ptr_void) {
-
-  int_fullprop_buffer<uint64_t> *mem_ptr =
-      (int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
-
-  cuda_drop_async(mem_ptr->lut_buffer, stream);
-  cuda_drop_async(mem_ptr->lut_indexes, stream);
-
-  cuda_drop_async(mem_ptr->pbs_buffer, stream);
-
-  cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
-  cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
-}
-
-void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
-      stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
-      allocate_gpu_memory);
-}
-
-void cuda_propagate_single_carry_low_latency_kb_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t num_blocks) {
-  host_propagate_single_carry_low_latency<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array),
-      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
-      static_cast<uint64_t *>(ksk), num_blocks);
-}
-
-void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
-                                                     int8_t **mem_ptr_void) {
-  int_sc_prop_memory<uint64_t> *mem_ptr =
-      (int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release(stream);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1,584 +0,0 @@
-#ifndef CUDA_INTEGER_CUH
-#define CUDA_INTEGER_CUH
-
-#include "bootstrap.h"
-#include "crypto/keyswitch.cuh"
-#include "device.h"
-#include "integer.h"
-#include "integer/scalar_addition.cuh"
-#include "linear_algebra.h"
-#include "linearalgebra/addition.cuh"
-#include "polynomial/functions.cuh"
-#include "utils/kernel_dimensions.cuh"
-#include <functional>
-
-// function rotates right  radix ciphertext with specific value
-// grid is one dimensional
-// blockIdx.x represents x_th block of radix ciphertext
-template <typename Torus>
-__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
-                                          uint32_t value, uint32_t blocks_count,
-                                          uint32_t lwe_size) {
-  value %= blocks_count;
-
-  size_t tid = threadIdx.x;
-  size_t src_block_id = blockIdx.x;
-  size_t dst_block_id = (src_block_id + value) % blocks_count;
-  size_t stride = blockDim.x;
-
-  auto cur_src_block = &src[src_block_id * lwe_size];
-  auto cur_dst_block = &dst[dst_block_id * lwe_size];
-
-  for (size_t i = tid; i < lwe_size; i += stride) {
-    cur_dst_block[i] = cur_src_block[i];
-  }
-}
-
-// function rotates left  radix ciphertext with specific value
-// grid is one dimensional
-// blockIdx.x represents x_th block of radix ciphertext
-template <typename Torus>
-__global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
-                                         uint32_t blocks_count,
-                                         uint32_t lwe_size) {
-  value %= blocks_count;
-  size_t src_block_id = blockIdx.x;
-
-  size_t tid = threadIdx.x;
-  size_t dst_block_id = (src_block_id >= value)
-                            ? src_block_id - value
-                            : src_block_id - value + blocks_count;
-  size_t stride = blockDim.x;
-
-  auto cur_src_block = &src[src_block_id * lwe_size];
-  auto cur_dst_block = &dst[dst_block_id * lwe_size];
-
-  for (size_t i = tid; i < lwe_size; i += stride) {
-    cur_dst_block[i] = cur_src_block[i];
-  }
-}
-
-// polynomial_size threads
-template <typename Torus>
-__global__ void
-device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_array_1,
-                             Torus *lwe_array_2, Torus *lwe_indexes,
-                             uint32_t lwe_dimension, uint32_t message_modulus,
-                             uint32_t num_blocks) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-  if (tid < num_blocks * (lwe_dimension + 1)) {
-    int block_id = tid / (lwe_dimension + 1);
-    int coeff_id = tid % (lwe_dimension + 1);
-
-    int pos = lwe_indexes[block_id] * (lwe_dimension + 1) + coeff_id;
-    lwe_array_out[pos] = lwe_array_1[pos] * message_modulus + lwe_array_2[pos];
-  }
-}
-
-template <typename Torus>
-__host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
-                                    Torus *lwe_array_1, Torus *lwe_array_2,
-                                    Torus *lwe_indexes, uint32_t lwe_dimension,
-                                    uint32_t message_modulus,
-                                    uint32_t num_radix_blocks) {
-
-  // Left message is shifted
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = num_radix_blocks * (lwe_dimension + 1);
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
-      lwe_array_out, lwe_array_1, lwe_array_2, lwe_indexes, lwe_dimension,
-      message_modulus, num_radix_blocks);
-  check_cuda_error(cudaGetLastError());
-}
-
-template <typename Torus>
-__host__ void integer_radix_apply_univariate_lookup_table_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
-  // apply_lookup_table
-  auto params = lut->params;
-  auto pbs_type = params.pbs_type;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto small_lwe_dimension = params.small_lwe_dimension;
-  auto ks_level = params.ks_level;
-  auto ks_base_log = params.ks_base_log;
-  auto pbs_level = params.pbs_level;
-  auto pbs_base_log = params.pbs_base_log;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto grouping_factor = params.grouping_factor;
-
-  // Compute Keyswitch-PBS
-  cuda_keyswitch_lwe_ciphertext_vector(
-      stream, lut->tmp_lwe_after_ks, lut->lwe_indexes, lwe_array_in,
-      lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
-      ks_base_log, ks_level, num_radix_blocks);
-
-  execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
-                     lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes,
-                     bsk, lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
-                     polynomial_size, pbs_base_log, pbs_level, grouping_factor,
-                     num_radix_blocks, 1, 0,
-                     cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
-}
-
-template <typename Torus>
-__host__ void integer_radix_apply_bivariate_lookup_table_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
-    Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
-    int_radix_lut<Torus> *lut) {
-  // apply_lookup_table_bivariate
-
-  auto params = lut->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-
-  // Left message is shifted
-  pack_bivariate_blocks(stream, lut->tmp_lwe_before_ks, lwe_array_1,
-                        lwe_array_2, lut->lwe_indexes, big_lwe_dimension,
-                        message_modulus, num_radix_blocks);
-  check_cuda_error(cudaGetLastError());
-
-  // Apply LUT
-  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
-                                                 lut->tmp_lwe_before_ks, bsk,
-                                                 ksk, num_radix_blocks, lut);
-}
-
-// Rotates the slice in-place such that the first mid elements of the slice move
-// to the end while the last array_length elements move to the front. After
-// calling rotate_left, the element previously at index mid will become the
-// first element in the slice.
-template <typename Torus>
-void rotate_left(Torus *buffer, int mid, uint32_t array_length) {
-  mid = mid % array_length;
-
-  std::rotate(buffer, buffer + mid, buffer + array_length);
-}
-
-template <typename Torus>
-void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
-                           uint32_t polynomial_size, uint32_t message_modulus,
-                           uint32_t carry_modulus,
-                           std::function<Torus(Torus)> f) {
-
-  uint32_t modulus_sup = message_modulus * carry_modulus;
-  uint32_t box_size = polynomial_size / modulus_sup;
-  Torus delta = (1ul << 63) / modulus_sup;
-
-  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
-
-  auto body = &acc[glwe_dimension * polynomial_size];
-
-  // This accumulator extracts the carry bits
-  for (int i = 0; i < modulus_sup; i++) {
-    int index = i * box_size;
-    for (int j = index; j < index + box_size; j++) {
-      auto f_eval = f(i);
-      body[j] = f_eval * delta;
-    }
-  }
-
-  int half_box_size = box_size / 2;
-
-  // Negate the first half_box_size coefficients
-  for (int i = 0; i < half_box_size; i++) {
-    body[i] = -body[i];
-  }
-
-  rotate_left(body, half_box_size, polynomial_size);
-}
-
-template <typename Torus>
-void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
-                                     uint32_t polynomial_size,
-                                     uint32_t message_modulus,
-                                     uint32_t carry_modulus,
-                                     std::function<Torus(Torus, Torus)> f) {
-
-  Torus factor_u64 = message_modulus;
-  auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
-    Torus lhs = (input / factor_u64) % message_modulus;
-    Torus rhs = (input % factor_u64) % message_modulus;
-
-    return f(lhs, rhs);
-  };
-
-  generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
-                               message_modulus, carry_modulus, wrapped_f);
-}
-
-/*
- *  generate bivariate accumulator for device pointer
- *    v_stream - cuda stream
- *    acc - device pointer for bivariate accumulator
- *    ...
- *    f - wrapping function with two Torus inputs
- */
-template <typename Torus>
-void generate_device_accumulator_bivariate(
-    cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
-    std::function<Torus(Torus, Torus)> f) {
-
-  // host lut
-  Torus *h_lut =
-      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
-
-  // fill bivariate accumulator
-  generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
-                                         message_modulus, carry_modulus, f);
-
-  // copy host lut and lut_indexes to device
-  cuda_memcpy_async_to_gpu(
-      acc_bivariate, h_lut,
-      (glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
-
-  cuda_synchronize_stream(stream);
-  free(h_lut);
-}
-
-/*
- *  generate bivariate accumulator for device pointer
- *    v_stream - cuda stream
- *    acc - device pointer for accumulator
- *    ...
- *    f - evaluating function with one Torus input
- */
-template <typename Torus>
-void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
-                                 uint32_t glwe_dimension,
-                                 uint32_t polynomial_size,
-                                 uint32_t message_modulus,
-                                 uint32_t carry_modulus,
-                                 std::function<Torus(Torus)> f) {
-
-  // host lut
-  Torus *h_lut =
-      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
-
-  // fill accumulator
-  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
-                               message_modulus, carry_modulus, f);
-
-  // copy host lut and lut_indexes to device
-  cuda_memcpy_async_to_gpu(
-      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
-      stream);
-
-  cuda_synchronize_stream(stream);
-  free(h_lut);
-}
-
-template <typename Torus>
-void scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
-    cuda_stream_t *stream, int_sc_prop_memory<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_sc_prop_memory<Torus>(stream, params, num_radix_blocks,
-                                           allocate_gpu_memory);
-}
-
-template <typename Torus>
-void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
-                                             Torus *lwe_array,
-                                             int_sc_prop_memory<Torus> *mem,
-                                             void *bsk, Torus *ksk,
-                                             uint32_t num_blocks) {
-  auto params = mem->params;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto big_lwe_size = glwe_dimension * polynomial_size + 1;
-  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  auto generates_or_propagates = mem->generates_or_propagates;
-  auto step_output = mem->step_output;
-
-  auto luts_array = mem->luts_array;
-  auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum;
-  auto message_acc = mem->message_acc;
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
-      luts_array);
-
-  // compute prefix sum with hillis&steele
-
-  int num_steps = ceil(log2((double)num_blocks));
-  int space = 1;
-  cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
-                               big_lwe_size_bytes * num_blocks, stream);
-
-  for (int step = 0; step < num_steps; step++) {
-    auto cur_blocks = &step_output[space * big_lwe_size];
-    auto prev_blocks = generates_or_propagates;
-    int cur_total_blocks = num_blocks - space;
-
-    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
-        luts_carry_propagation_sum);
-
-    cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
-                                 cur_blocks,
-                                 big_lwe_size_bytes * cur_total_blocks, stream);
-    space *= 2;
-  }
-
-  radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
-      step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
-  cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
-
-  host_addition(stream, lwe_array, lwe_array, step_output,
-                glwe_dimension * polynomial_size, num_blocks);
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
-}
-
-/*
- * input_blocks: input radix ciphertext propagation will happen inplace
- * acc_message_carry: list of two lut s, [(message_acc), (carry_acc)]
- * lut_indexes_message_carry: lut_indexes for message and carry, should always
- * be  {0, 1} small_lwe_vector: output of keyswitch should have size = 2 *
- * (lwe_dimension + 1) * sizeof(Torus) big_lwe_vector: output of pbs should have
- *     size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
- */
-template <typename Torus, typename STorus, class params>
-void host_full_propagate_inplace(cuda_stream_t *stream, Torus *input_blocks,
-                                 int_fullprop_buffer<Torus> *mem_ptr,
-                                 Torus *ksk, void *bsk, uint32_t lwe_dimension,
-                                 uint32_t glwe_dimension,
-                                 uint32_t polynomial_size, uint32_t ks_base_log,
-                                 uint32_t ks_level, uint32_t pbs_base_log,
-                                 uint32_t pbs_level, uint32_t grouping_factor,
-                                 uint32_t num_blocks) {
-
-  int big_lwe_size = (glwe_dimension * polynomial_size + 1);
-  int small_lwe_size = (lwe_dimension + 1);
-
-  for (int i = 0; i < num_blocks; i++) {
-    auto cur_input_block = &input_blocks[i * big_lwe_size];
-
-    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
-        stream, mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes,
-        cur_input_block, mem_ptr->lwe_indexes, ksk,
-        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
-        1);
-
-    cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
-                                 mem_ptr->tmp_small_lwe_vector,
-                                 small_lwe_size * sizeof(Torus), stream);
-
-    execute_pbs<Torus>(
-        stream, mem_ptr->tmp_big_lwe_vector, mem_ptr->lwe_indexes,
-        mem_ptr->lut_buffer, mem_ptr->lut_indexes,
-        mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk,
-        mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
-        pbs_base_log, pbs_level, grouping_factor, 2, 2, 0,
-        cuda_get_max_shared_memory(stream->gpu_index), mem_ptr->pbs_type);
-
-    cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
-                                 big_lwe_size * sizeof(Torus), stream);
-
-    if (i < num_blocks - 1) {
-      auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
-      host_addition(stream, next_input_block, next_input_block,
-                    &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
-                    glwe_dimension * polynomial_size, 1);
-    }
-  }
-}
-
-template <typename Torus>
-void scratch_cuda_full_propagation(
-    cuda_stream_t *stream, int_fullprop_buffer<Torus> **mem_ptr,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
-
-  int8_t *pbs_buffer;
-  execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
-                             polynomial_size, pbs_level, grouping_factor,
-                             num_radix_blocks,
-                             cuda_get_max_shared_memory(stream->gpu_index),
-                             pbs_type, allocate_gpu_memory);
-
-  // LUT
-  Torus *lut_buffer;
-  if (allocate_gpu_memory) {
-    // LUT is used as a trivial encryption, so we only allocate memory for the
-    // body
-    Torus lut_buffer_size =
-        2 * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);
-
-    lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream);
-
-    // LUTs
-    auto lut_f_message = [message_modulus](Torus x) -> Torus {
-      return x % message_modulus;
-    };
-    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
-      return x / message_modulus;
-    };
-
-    //
-    Torus *lut_buffer_message = lut_buffer;
-    Torus *lut_buffer_carry =
-        lut_buffer + (glwe_dimension + 1) * polynomial_size;
-
-    generate_device_accumulator<Torus>(
-        stream, lut_buffer_message, glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, lut_f_message);
-
-    generate_device_accumulator<Torus>(stream, lut_buffer_carry, glwe_dimension,
-                                       polynomial_size, message_modulus,
-                                       carry_modulus, lut_f_carry);
-  }
-
-  Torus *lut_indexes;
-  if (allocate_gpu_memory) {
-    lut_indexes = (Torus *)cuda_malloc_async(2 * sizeof(Torus), stream);
-
-    Torus h_lut_indexes[2] = {0, 1};
-    cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, 2 * sizeof(Torus),
-                             stream);
-  }
-
-  Torus *lwe_indexes;
-  if (allocate_gpu_memory) {
-    Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
-
-    lwe_indexes = (Torus *)cuda_malloc_async(lwe_indexes_size, stream);
-    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
-    for (int i = 0; i < num_radix_blocks; i++)
-      h_lwe_indexes[i] = i;
-    cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
-                             stream);
-    cuda_synchronize_stream(stream);
-    free(h_lwe_indexes);
-  }
-
-  // Temporary arrays
-  Torus *small_lwe_vector;
-  Torus *big_lwe_vector;
-  if (allocate_gpu_memory) {
-    Torus small_vector_size = 2 * (lwe_dimension + 1) * sizeof(Torus);
-    Torus big_vector_size =
-        2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus);
-
-    small_lwe_vector = (Torus *)cuda_malloc_async(small_vector_size, stream);
-    big_lwe_vector = (Torus *)cuda_malloc_async(big_vector_size, stream);
-  }
-
-  *mem_ptr = new int_fullprop_buffer<Torus>;
-
-  (*mem_ptr)->pbs_type = pbs_type;
-  (*mem_ptr)->pbs_buffer = pbs_buffer;
-
-  (*mem_ptr)->lut_buffer = lut_buffer;
-  (*mem_ptr)->lut_indexes = lut_indexes;
-  (*mem_ptr)->lwe_indexes = lwe_indexes;
-
-  (*mem_ptr)->tmp_small_lwe_vector = small_lwe_vector;
-  (*mem_ptr)->tmp_big_lwe_vector = big_lwe_vector;
-}
-
-// (lwe_dimension+1) threads
-// (num_radix_blocks / 2) thread blocks
-template <typename Torus>
-__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
-                                   uint32_t lwe_dimension,
-                                   uint32_t num_radix_blocks, uint32_t factor) {
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-  if (tid < (lwe_dimension + 1)) {
-    for (int bid = 0; bid < (num_radix_blocks / 2); bid++) {
-      Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1);
-      Torus *msb_block = lsb_block + (lwe_dimension + 1);
-
-      Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1);
-
-      packed_block[tid] = lsb_block[tid] + factor * msb_block[tid];
-    }
-
-    if (num_radix_blocks % 2 != 0) {
-      // We couldn't pack the last block, so we just copy it
-      Torus *lsb_block =
-          lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
-      Torus *last_block =
-          lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1);
-
-      last_block[tid] = lsb_block[tid];
-    }
-  }
-}
-
-// Packs the low ciphertext in the message parts of the high ciphertext
-// and moves the high ciphertext into the carry part.
-//
-// This requires the block parameters to have enough room for two ciphertexts,
-// so at least as many carry modulus as the message modulus
-//
-// Expects the carry buffer to be empty
-template <typename Torus>
-__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
-                          Torus *lwe_array_in, uint32_t lwe_dimension,
-                          uint32_t num_radix_blocks, uint32_t factor) {
-  assert(lwe_array_out != lwe_array_in);
-
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = (lwe_dimension + 1);
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  device_pack_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
-      lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
-}
-
-template <typename Torus>
-__global__ void
-device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
-                            int32_t num_blocks, uint32_t lwe_dimension,
-                            uint64_t delta) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < num_blocks) {
-    Torus scalar = scalar_input[tid];
-    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
-
-    *body = scalar * delta;
-  }
-}
-
-template <typename Torus>
-__host__ void
-create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
-                     Torus *scalar_array, uint32_t lwe_dimension,
-                     uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
-                     uint64_t message_modulus, uint64_t carry_modulus) {
-
-  size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
-  cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);
-
-  if (num_scalar_blocks == 0)
-    return;
-
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = num_scalar_blocks;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_create_trivial_radix<<<grid, thds, 0, stream->stream>>>(
-      lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
-  check_cuda_error(cudaGetLastError());
-}
-
-#endif // TFHE_RS_INTERNAL_INTEGER_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -1,107 +0,0 @@
-#include "integer/multiplication.cuh"
-
-/*
- * This scratch function allocates the necessary amount of data on the GPU for
- * the integer radix multiplication in keyswitch->bootstrap order.
- */
-void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t message_modulus,
-    uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
-    uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
-    uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
-    uint32_t num_radix_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          polynomial_size, lwe_dimension, ks_level, ks_base_log,
-                          pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  switch (polynomial_size) {
-  case 2048:
-    scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
-        stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
-        allocate_gpu_memory);
-    break;
-  default:
-    break;
-  }
-}
-
-/*
- * Computes a multiplication between two 64 bit radix lwe ciphertexts
- * encrypting integer values. keyswitch -> bootstrap pattern is used, function
- * works for single pair of radix ciphertexts, 'v_stream' can be used for
- * parallelization
- * - 'v_stream' is a void pointer to the Cuda stream to be used in the kernel
- * launch
- * - 'gpu_index' is the index of the GPU to be used in the kernel launch
- * - 'radix_lwe_out' is 64 bit radix big lwe ciphertext, product of
- * multiplication
- * - 'radix_lwe_left' left radix big lwe ciphertext
- * - 'radix_lwe_right' right radix big lwe ciphertext
- * - 'bsk' bootstrapping key in fourier domain
- * - 'ksk' keyswitching key
- * - 'mem_ptr'
- * - 'message_modulus' message_modulus
- * - 'carry_modulus' carry_modulus
- * - 'glwe_dimension' glwe_dimension
- * - 'lwe_dimension' is the dimension of small lwe ciphertext
- * - 'polynomial_size' polynomial size
- * - 'pbs_base_log' base log used in the pbs
- * - 'pbs_level' decomposition level count used in the pbs
- * - 'ks_level' decomposition level count used in the keyswitch
- * - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
- * ciphertext
- * - 'pbs_type' selects which PBS implementation should be used
- * - 'max_shared_memory' maximum shared memory per cuda block
- */
-void cuda_integer_mult_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_left,
-    void *radix_lwe_right, void *bsk, void *ksk, int8_t *mem_ptr,
-    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
-    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
-    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
-    uint32_t max_shared_memory) {
-
-  switch (polynomial_size) {
-  case 2048:
-    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
-        stream, static_cast<uint64_t *>(radix_lwe_out),
-        static_cast<uint64_t *>(radix_lwe_left),
-        static_cast<uint64_t *>(radix_lwe_right), bsk,
-        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
-        num_blocks);
-    break;
-  default:
-    break;
-  }
-}
-
-void cleanup_cuda_integer_mult(cuda_stream_t *stream, int8_t **mem_ptr_void) {
-
-  int_mul_memory<uint64_t> *mem_ptr =
-      (int_mul_memory<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release(stream);
-}
-
-void cuda_small_scalar_multiplication_integer_radix_ciphertext_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, uint64_t scalar,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {
-
-  cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
-      stream, lwe_array, lwe_array, scalar, lwe_dimension,
-      lwe_ciphertext_count);
-}
-
-void cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
-    cuda_stream_t *stream, void *output_lwe_array, void *input_lwe_array,
-    uint64_t scalar, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {
-
-  host_integer_small_scalar_mult_radix(
-      stream, static_cast<uint64_t *>(output_lwe_array),
-      static_cast<uint64_t *>(input_lwe_array), scalar, lwe_dimension,
-      lwe_ciphertext_count);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -1,630 +0,0 @@
-#ifndef CUDA_INTEGER_MULT_CUH
-#define CUDA_INTEGER_MULT_CUH
-
-#ifdef __CDT_PARSER__
-#undef __CUDA_RUNTIME_H__
-#include <cuda_runtime.h>
-#endif
-
-#include "bootstrap.h"
-#include "crypto/keyswitch.cuh"
-#include "device.h"
-#include "integer.h"
-#include "integer/integer.cuh"
-#include "linear_algebra.h"
-#include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
-#include <fstream>
-#include <iostream>
-#include <omp.h>
-#include <sstream>
-#include <string>
-#include <vector>
-
-template <typename Torus, class params>
-__global__ void
-all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
-                    Torus *msb_ciphertext, Torus *radix_lwe_right,
-                    Torus *lsb_rhs, Torus *msb_rhs, int num_blocks) {
-
-  size_t block_id = blockIdx.x;
-  double D = sqrt((2 * num_blocks + 1) * (2 * num_blocks + 1) - 8 * block_id);
-  size_t radix_id = int((2 * num_blocks + 1 - D) / 2.);
-  size_t local_block_id =
-      block_id - (2 * num_blocks - radix_id + 1) / 2. * radix_id;
-  bool process_msb = (local_block_id < (num_blocks - radix_id - 1));
-  auto cur_lsb_block = &lsb_ciphertext[block_id * (params::degree + 1)];
-  auto cur_msb_block =
-      (process_msb)
-          ? &msb_ciphertext[(block_id - radix_id) * (params::degree + 1)]
-          : nullptr;
-
-  auto cur_lsb_rhs_block = &lsb_rhs[block_id * (params::degree + 1)];
-  auto cur_msb_rhs_block =
-      (process_msb) ? &msb_rhs[(block_id - radix_id) * (params::degree + 1)]
-                    : nullptr;
-
-  auto cur_ct_right = &radix_lwe_right[radix_id * (params::degree + 1)];
-  auto cur_src = &radix_lwe_left[local_block_id * (params::degree + 1)];
-
-  size_t tid = threadIdx.x;
-
-  for (int i = 0; i < params::opt; i++) {
-    Torus value = cur_src[tid];
-    if (process_msb) {
-      cur_lsb_block[tid] = cur_msb_block[tid] = value;
-      cur_lsb_rhs_block[tid] = cur_msb_rhs_block[tid] = cur_ct_right[tid];
-    } else {
-      cur_lsb_block[tid] = value;
-      cur_lsb_rhs_block[tid] = cur_ct_right[tid];
-    }
-    tid += params::degree / params::opt;
-  }
-  if (threadIdx.x == 0) {
-    Torus value = cur_src[params::degree];
-    if (process_msb) {
-      cur_lsb_block[params::degree] = cur_msb_block[params::degree] = value;
-      cur_lsb_rhs_block[params::degree] = cur_msb_rhs_block[params::degree] =
-          cur_ct_right[params::degree];
-    } else {
-      cur_lsb_block[params::degree] = value;
-      cur_lsb_rhs_block[params::degree] = cur_ct_right[params::degree];
-    }
-  }
-}
-
-template <typename Torus>
-void compress_device_array_with_map(cuda_stream_t *stream, Torus *src,
-                                    Torus *dst, int *S, int *F, int num_blocks,
-                                    uint32_t map_size, uint32_t unit_size,
-                                    int &total_copied, bool is_message) {
-  for (int i = 0; i < map_size; i++) {
-    int s_index = i * num_blocks + S[i];
-    int number_of_unit = F[i] - S[i] + is_message;
-    auto cur_dst = &dst[total_copied * unit_size];
-    auto cur_src = &src[s_index * unit_size];
-    size_t copy_size = unit_size * number_of_unit * sizeof(Torus);
-    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
-    total_copied += number_of_unit;
-  }
-}
-
-template <typename Torus>
-void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src,
-                                         Torus *dst, int *S, int *F,
-                                         uint32_t map_size, uint32_t unit_size,
-                                         int &total_copied,
-                                         int &total_radix_copied,
-                                         int num_blocks, bool is_message) {
-  size_t radix_size = unit_size * num_blocks;
-  for (int i = 0; i < map_size; i++) {
-    auto cur_dst_radix = &dst[total_radix_copied * radix_size];
-
-    int s_index = S[i];
-    int number_of_unit = F[i] - s_index + is_message;
-
-    if (!is_message) {
-      int zero_block_count = num_blocks - number_of_unit;
-      cuda_memset_async(cur_dst_radix, 0,
-                        zero_block_count * unit_size * sizeof(Torus), stream);
-      s_index = zero_block_count;
-    }
-
-    auto cur_dst = &cur_dst_radix[s_index * unit_size];
-    auto cur_src = &src[total_copied * unit_size];
-
-    size_t copy_size = unit_size * number_of_unit * sizeof(Torus);
-    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
-    total_copied += number_of_unit;
-    ++total_radix_copied;
-  }
-}
-
-template <typename Torus, class params>
-__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
-                                uint32_t chunk_size, uint32_t num_blocks) {
-
-  extern __shared__ Torus result[];
-  size_t chunk_id = blockIdx.x;
-  size_t chunk_elem_size = chunk_size * num_blocks * (params::degree + 1);
-  size_t radix_elem_size = num_blocks * (params::degree + 1);
-  auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
-  auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
-  size_t block_stride = blockIdx.y * (params::degree + 1);
-  auto dst_block = &dst_radix[block_stride];
-
-  // init shared mem with first radix of chunk
-  size_t tid = threadIdx.x;
-  for (int i = 0; i < params::opt; i++) {
-    result[tid] = src_chunk[block_stride + tid];
-    tid += params::degree / params::opt;
-  }
-
-  if (threadIdx.x == 0) {
-    result[params::degree] = src_chunk[block_stride + params::degree];
-  }
-
-  // accumulate rest  of the radixes
-  for (int r_id = 1; r_id < chunk_size; r_id++) {
-    auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
-    tid = threadIdx.x;
-    for (int i = 0; i < params::opt; i++) {
-      result[tid] += cur_src_radix[block_stride + tid];
-      tid += params::degree / params::opt;
-    }
-    if (threadIdx.x == 0) {
-      result[params::degree] += cur_src_radix[block_stride + params::degree];
-    }
-  }
-
-  // put result from shared mem to global mem
-  tid = threadIdx.x;
-  for (int i = 0; i < params::opt; i++) {
-    dst_block[tid] = result[tid];
-    tid += params::degree / params::opt;
-  }
-
-  if (threadIdx.x == 0) {
-    dst_block[params::degree] = result[params::degree];
-  }
-}
-
-template <typename Torus, class params>
-__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
-                                        Torus *msb_blocks,
-                                        uint32_t glwe_dimension,
-                                        uint32_t lsb_count, uint32_t msb_count,
-                                        uint32_t num_blocks) {
-  size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
-  size_t big_lwe_id = blockIdx.x;
-  size_t radix_id = big_lwe_id / num_blocks;
-  size_t block_id = big_lwe_id % num_blocks;
-  size_t lsb_block_id = block_id - radix_id;
-  size_t msb_block_id = block_id - radix_id - 1;
-
-  bool process_lsb = (radix_id <= block_id);
-  bool process_msb = (radix_id + 1 <= block_id);
-
-  auto cur_res_lsb_ct = &result_blocks[big_lwe_id * big_lwe_dimension];
-  auto cur_res_msb_ct =
-      &result_blocks[num_blocks * num_blocks * big_lwe_dimension +
-                     big_lwe_id * big_lwe_dimension];
-  Torus *cur_lsb_radix = &lsb_blocks[(2 * num_blocks - radix_id + 1) *
-                                     radix_id / 2 * (params::degree + 1)];
-  Torus *cur_msb_radix = (process_msb)
-                             ? &msb_blocks[(2 * num_blocks - radix_id - 1) *
-                                           radix_id / 2 * (params::degree + 1)]
-                             : nullptr;
-  Torus *cur_lsb_ct = (process_lsb)
-                          ? &cur_lsb_radix[lsb_block_id * (params::degree + 1)]
-                          : nullptr;
-  Torus *cur_msb_ct = (process_msb)
-                          ? &cur_msb_radix[msb_block_id * (params::degree + 1)]
-                          : nullptr;
-  size_t tid = threadIdx.x;
-
-  for (int i = 0; i < params::opt; i++) {
-    cur_res_lsb_ct[tid] = (process_lsb) ? cur_lsb_ct[tid] : 0;
-    cur_res_msb_ct[tid] = (process_msb) ? cur_msb_ct[tid] : 0;
-    tid += params::degree / params::opt;
-  }
-
-  if (threadIdx.x == 0) {
-    cur_res_lsb_ct[params::degree] =
-        (process_lsb) ? cur_lsb_ct[params::degree] : 0;
-    cur_res_msb_ct[params::degree] =
-        (process_msb) ? cur_msb_ct[params::degree] : 0;
-  }
-}
-
-template <typename Torus, typename STorus, class params>
-__host__ void host_integer_mult_radix_kb(
-    cuda_stream_t *stream, uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
-    uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
-    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
-
-  auto glwe_dimension = mem_ptr->params.glwe_dimension;
-  auto polynomial_size = mem_ptr->params.polynomial_size;
-  auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
-  auto message_modulus = mem_ptr->params.message_modulus;
-  auto carry_modulus = mem_ptr->params.carry_modulus;
-
-  int big_lwe_dimension = glwe_dimension * polynomial_size;
-  int big_lwe_size = big_lwe_dimension + 1;
-
-  // 'vector_result_lsb' contains blocks from all possible right shifts of
-  // radix_lwe_left, only nonzero blocks are kept
-  int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2;
-
-  // 'vector_result_msb' contains blocks from all possible shifts of
-  // radix_lwe_left except the last blocks of each shift. Only nonzero blocks
-  // are kept
-  int msb_vector_block_count = num_blocks * (num_blocks - 1) / 2;
-
-  // total number of blocks msb and lsb
-  int total_block_count = lsb_vector_block_count + msb_vector_block_count;
-
-  // buffer to keep all lsb and msb shifts
-  // for lsb all nonzero blocks of each right shifts are kept
-  // for 0 shift num_blocks blocks
-  // for 1 shift num_blocks - 1 blocks
-  // for num_blocks - 1 shift 1 block
-  // (num_blocks + 1) * num_blocks / 2 blocks
-  // for msb we don't keep track for last blocks so
-  // for 0 shift num_blocks - 1 blocks
-  // for 1 shift num_blocks - 2 blocks
-  // for num_blocks - 1 shift  0 blocks
-  // (num_blocks - 1) * num_blocks / 2 blocks
-  // in total num_blocks^2 blocks
-  // in each block three is big polynomial with
-  // glwe_dimension * polynomial_size + 1 coefficients
-  auto vector_result_sb = mem_ptr->vector_result_sb;
-
-  // buffer to keep lsb_vector + msb_vector
-  // addition will happen in full terms so there will be
-  // num_blocks terms and each term will have num_blocks block
-  // num_blocks^2 blocks in total
-  // and each blocks has big lwe ciphertext with
-  // glwe_dimension * polynomial_size + 1 coefficients
-  auto block_mul_res = mem_ptr->block_mul_res;
-
-  // buffer to keep keyswitch result of num_blocks^2 ciphertext
-  // in total it has num_blocks^2 small lwe ciphertexts with
-  // lwe_dimension +1 coefficients
-  auto small_lwe_vector = mem_ptr->small_lwe_vector;
-
-  // it contains two lut, first for lsb extraction,
-  // second for msb extraction, with total length =
-  // 2 * (glwe_dimension + 1) * polynomial_size
-  auto luts_array = mem_ptr->luts_array;
-
-  // accumulator to extract message
-  // with length (glwe_dimension + 1) * polynomial_size
-  auto luts_message = mem_ptr->luts_message;
-
-  // accumulator to extract carry
-  // with length (glwe_dimension + 1) * polynomial_size
-  auto luts_carry = mem_ptr->luts_carry;
-
-  // to be used as default indexing
-  auto lwe_indexes = luts_array->lwe_indexes;
-
-  auto vector_result_lsb = &vector_result_sb[0];
-  auto vector_result_msb =
-      &vector_result_sb[lsb_vector_block_count *
-                        (polynomial_size * glwe_dimension + 1)];
-
-  auto vector_lsb_rhs = &block_mul_res[0];
-  auto vector_msb_rhs = &block_mul_res[lsb_vector_block_count *
-                                       (polynomial_size * glwe_dimension + 1)];
-
-  dim3 grid(lsb_vector_block_count, 1, 1);
-  dim3 thds(params::degree / params::opt, 1, 1);
-
-  all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
-      radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
-      vector_lsb_rhs, vector_msb_rhs, num_blocks);
-
-  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
-      total_block_count, luts_array);
-
-  vector_result_lsb = &block_mul_res[0];
-  vector_result_msb = &block_mul_res[lsb_vector_block_count *
-                                     (polynomial_size * glwe_dimension + 1)];
-
-  fill_radix_from_lsb_msb<Torus, params>
-      <<<num_blocks * num_blocks, params::degree / params::opt, 0,
-         stream->stream>>>(vector_result_sb, vector_result_lsb,
-                           vector_result_msb, glwe_dimension,
-                           lsb_vector_block_count, msb_vector_block_count,
-                           num_blocks);
-
-  auto new_blocks = block_mul_res;
-  auto old_blocks = vector_result_sb;
-
-  // amount of current radixes after block_mul
-  size_t r = 2 * num_blocks;
-
-  size_t total_modulus = message_modulus * carry_modulus;
-  size_t message_max = message_modulus - 1;
-  size_t chunk_size = (total_modulus - 1) / message_max;
-  size_t ch_amount = r / chunk_size;
-
-  int terms_degree[r * num_blocks];
-  int f_b[ch_amount];
-  int l_b[ch_amount];
-
-  for (int i = 0; i < num_blocks * num_blocks; i++) {
-    size_t r_id = i / num_blocks;
-    size_t b_id = i % num_blocks;
-    terms_degree[i] = (b_id >= r_id) ? 3 : 0;
-  }
-  auto terms_degree_msb = &terms_degree[num_blocks * num_blocks];
-  for (int i = 0; i < num_blocks * num_blocks; i++) {
-    size_t r_id = i / num_blocks;
-    size_t b_id = i % num_blocks;
-    terms_degree_msb[i] = (b_id > r_id) ? 2 : 0;
-  }
-
-  auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
-  while (r > chunk_size) {
-    int cur_total_blocks = r * num_blocks;
-    ch_amount = r / chunk_size;
-    dim3 add_grid(ch_amount, num_blocks, 1);
-    size_t sm_size = big_lwe_size * sizeof(Torus);
-    cuda_memset_async(new_blocks, 0,
-                      ch_amount * num_blocks * big_lwe_size * sizeof(Torus),
-                      stream);
-
-    tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
-        new_blocks, old_blocks, chunk_size, num_blocks);
-
-    for (int c_id = 0; c_id < ch_amount; c_id++) {
-      auto cur_chunk = &terms_degree[c_id * chunk_size * num_blocks];
-      int mx = 0;
-      int mn = num_blocks;
-      for (int r_id = 1; r_id < chunk_size; r_id++) {
-        auto cur_radix = &cur_chunk[r_id * num_blocks];
-        for (int i = 0; i < num_blocks; i++) {
-          if (cur_radix[i]) {
-            mn = min(mn, i);
-            mx = max(mx, i);
-          }
-        }
-      }
-      f_b[c_id] = mn;
-      l_b[c_id] = mx;
-    }
-
-    int total_copied = 0;
-    int message_count = 0;
-    int carry_count = 0;
-    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
-                                          l_b, num_blocks, ch_amount,
-                                          big_lwe_size, total_copied, true);
-
-    message_count = total_copied;
-    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
-                                          l_b, num_blocks, ch_amount,
-                                          big_lwe_size, total_copied, false);
-    carry_count = total_copied - message_count;
-
-    auto message_blocks_vector = old_blocks;
-    auto carry_blocks_vector =
-        &old_blocks[message_count * (glwe_dimension * polynomial_size + 1)];
-
-    cuda_keyswitch_lwe_ciphertext_vector(
-        stream, small_lwe_vector, lwe_indexes, old_blocks, lwe_indexes, ksk,
-        polynomial_size * glwe_dimension, lwe_dimension,
-        mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);
-
-    execute_pbs<Torus>(
-        stream, message_blocks_vector, lwe_indexes, luts_message->lut,
-        luts_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
-        luts_message->pbs_buffer, glwe_dimension, lwe_dimension,
-        polynomial_size, mem_ptr->params.pbs_base_log,
-        mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
-        message_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type);
-
-    execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
-                       luts_carry->lut, luts_carry->lut_indexes,
-                       &small_lwe_vector[message_count * (lwe_dimension + 1)],
-                       lwe_indexes, bsk, luts_carry->pbs_buffer, glwe_dimension,
-                       lwe_dimension, polynomial_size,
-                       mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
-                       mem_ptr->params.grouping_factor, carry_count, 1, 0,
-                       max_shared_memory, mem_ptr->params.pbs_type);
-
-    int rem_blocks = r % chunk_size * num_blocks;
-    int new_blocks_created = 2 * ch_amount * num_blocks;
-    int copy_size = rem_blocks * big_lwe_size * sizeof(Torus);
-
-    auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
-    auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
-    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
-
-    total_copied = 0;
-    int total_radix_copied = 0;
-    extract_message_carry_to_full_radix<Torus>(
-        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
-        total_copied, total_radix_copied, num_blocks, true);
-    extract_message_carry_to_full_radix<Torus>(
-        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
-        total_copied, total_radix_copied, num_blocks, false);
-
-    std::swap(new_blocks, old_blocks);
-    r = (new_blocks_created + rem_blocks) / num_blocks;
-  }
-
-  dim3 add_grid(1, num_blocks, 1);
-  size_t sm_size = big_lwe_size * sizeof(Torus);
-  cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus),
-                    stream);
-  tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
-      radix_lwe_out, old_blocks, r, num_blocks);
-
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, vector_result_sb, radix_lwe_out, bsk, ksk, num_blocks,
-      luts_message);
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      stream, &block_mul_res[big_lwe_size], radix_lwe_out, bsk, ksk, num_blocks,
-      luts_carry);
-
-  cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);
-
-  host_addition(stream, radix_lwe_out, vector_result_sb, block_mul_res,
-                big_lwe_size, num_blocks);
-
-  host_propagate_single_carry_low_latency<Torus>(
-      stream, radix_lwe_out, mem_ptr->scp_mem, bsk, ksk, num_blocks);
-}
-
-template <typename Torus>
-__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
-    cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, int_radix_params params,
-    bool allocate_gpu_memory) {
-  *mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
-                                       allocate_gpu_memory);
-}
-
-// Function to apply lookup table,
-// It has two mode
-//  lsb_msb_mode == true - extracts lsb and msb
-//  lsb_msb_mode == false - extracts message and carry
-template <typename Torus, typename STorus, class params>
-void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
-                        int_mul_memory<Torus> *mem_ptr, uint32_t glwe_dimension,
-                        uint32_t lwe_dimension, uint32_t polynomial_size,
-                        uint32_t pbs_base_log, uint32_t pbs_level,
-                        uint32_t ks_base_log, uint32_t ks_level,
-                        uint32_t grouping_factor,
-                        uint32_t lsb_message_blocks_count,
-                        uint32_t msb_carry_blocks_count,
-                        uint32_t max_shared_memory, bool lsb_msb_mode) {
-
-  int total_blocks_count = lsb_message_blocks_count + msb_carry_blocks_count;
-  int gpu_n = mem_ptr->p2p_gpu_count;
-  if (total_blocks_count < gpu_n)
-    gpu_n = total_blocks_count;
-  int gpu_blocks_count = total_blocks_count / gpu_n;
-  int big_lwe_size = glwe_dimension * polynomial_size + 1;
-  //  int small_lwe_size = lwe_dimension + 1;
-
-#pragma omp parallel for num_threads(gpu_n)
-  for (int i = 0; i < gpu_n; i++) {
-    cudaSetDevice(i);
-    auto this_stream = mem_ptr->streams[i];
-    // Index where input and output blocks start for current gpu
-    int big_lwe_start_index = i * gpu_blocks_count * big_lwe_size;
-
-    // Last gpu might have extra blocks to process if total blocks number is not
-    // divisible by gpu_n
-    if (i == gpu_n - 1) {
-      gpu_blocks_count += total_blocks_count % gpu_n;
-    }
-
-    int can_access_peer;
-    cudaDeviceCanAccessPeer(&can_access_peer, i, 0);
-    if (i == 0) {
-      check_cuda_error(
-          cudaMemcpyAsync(mem_ptr->pbs_output_multi_gpu[i],
-                          &input_ciphertexts[big_lwe_start_index],
-                          gpu_blocks_count * big_lwe_size * sizeof(Torus),
-                          cudaMemcpyDeviceToDevice, *this_stream));
-    } else if (can_access_peer) {
-      check_cuda_error(cudaMemcpyPeerAsync(
-          mem_ptr->pbs_output_multi_gpu[i], i,
-          &input_ciphertexts[big_lwe_start_index], 0,
-          gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
-    } else {
-      // Uses host memory as middle ground
-      cuda_memcpy_async_to_cpu(mem_ptr->device_to_device_buffer[i],
-                               &input_ciphertexts[big_lwe_start_index],
-                               gpu_blocks_count * big_lwe_size * sizeof(Torus),
-                               this_stream, i);
-      cuda_memcpy_async_to_gpu(
-          mem_ptr->pbs_output_multi_gpu[i], mem_ptr->device_to_device_buffer[i],
-          gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
-    }
-
-    // when lsb and msb have to be extracted
-    //  for first lsb_count blocks we need lsb_acc
-    //  for last msb_count blocks we need msb_acc
-    // when message and carry have tobe extracted
-    //  for first message_count blocks we need message_acc
-    //  for last carry_count blocks we need carry_acc
-    Torus *cur_lut_indexes;
-    if (lsb_msb_mode) {
-      cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
-                            ? mem_ptr->lut_indexes_lsb_multi_gpu[i]
-                            : mem_ptr->lut_indexes_msb_multi_gpu[i];
-
-    } else {
-      cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
-                            ? mem_ptr->lut_indexes_message_multi_gpu[i]
-                            : mem_ptr->lut_indexes_carry_multi_gpu[i];
-    }
-
-    // execute keyswitch on a current gpu with corresponding input and output
-    // blocks pbs_output_multi_gpu[i] is an input for keyswitch and
-    // pbs_input_multi_gpu[i] is an output for keyswitch
-    cuda_keyswitch_lwe_ciphertext_vector(
-        this_stream, i, mem_ptr->pbs_input_multi_gpu[i],
-        mem_ptr->pbs_output_multi_gpu[i], mem_ptr->ksk_multi_gpu[i],
-        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
-        gpu_blocks_count);
-
-    // execute pbs on a current gpu with corresponding input and output
-    cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
-        this_stream, i, mem_ptr->pbs_output_multi_gpu[i],
-        mem_ptr->lut_multi_gpu[i], cur_lut_indexes,
-        mem_ptr->pbs_input_multi_gpu[i], mem_ptr->bsk_multi_gpu[i],
-        mem_ptr->pbs_buffer_multi_gpu[i], lwe_dimension, glwe_dimension,
-        polynomial_size, grouping_factor, pbs_base_log, pbs_level,
-        grouping_factor, gpu_blocks_count, 2, 0, max_shared_memory);
-
-    // lookup table is applied and now data from current gpu have to be copied
-    // back to gpu_0 in 'output_ciphertexts' buffer
-    if (i == 0) {
-      check_cuda_error(
-          cudaMemcpyAsync(&output_ciphertexts[big_lwe_start_index],
-                          mem_ptr->pbs_output_multi_gpu[i],
-                          gpu_blocks_count * big_lwe_size * sizeof(Torus),
-                          cudaMemcpyDeviceToDevice, *this_stream));
-    } else if (can_access_peer) {
-      check_cuda_error(cudaMemcpyPeerAsync(
-          &output_ciphertexts[big_lwe_start_index], 0,
-          mem_ptr->pbs_output_multi_gpu[i], i,
-          gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
-    } else {
-      // Uses host memory as middle ground
-      cuda_memcpy_async_to_cpu(
-          mem_ptr->device_to_device_buffer[i], mem_ptr->pbs_output_multi_gpu[i],
-          gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
-      cuda_memcpy_async_to_gpu(&output_ciphertexts[big_lwe_start_index],
-                               mem_ptr->device_to_device_buffer[i],
-                               gpu_blocks_count * big_lwe_size * sizeof(Torus),
-                               this_stream, i);
-    }
-  }
-}
-
-template <typename T>
-__global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
-                                                         T *input_lwe_array,
-                                                         T scalar,
-                                                         uint32_t lwe_dimension,
-                                                         uint32_t num_blocks) {
-
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int lwe_size = lwe_dimension + 1;
-  if (index < num_blocks * lwe_size) {
-    // Here we take advantage of the wrapping behaviour of uint
-    output_lwe_array[index] = input_lwe_array[index] * scalar;
-  }
-}
-
-template <typename T>
-__host__ void host_integer_small_scalar_mult_radix(
-    cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
-    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
-
-  cudaSetDevice(stream->gpu_index);
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = input_lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = input_lwe_ciphertext_count * lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
-      output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
-      input_lwe_ciphertext_count);
-  check_cuda_error(cudaGetLastError());
-}
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -1,12 +0,0 @@
-#include "integer/negation.cuh"
-
-void cuda_negate_integer_radix_ciphertext_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus) {
-
-  host_integer_radix_negation(stream, static_cast<uint64_t *>(lwe_array),
-                              static_cast<uint64_t *>(lwe_array), lwe_dimension,
-                              lwe_ciphertext_count, message_modulus,
-                              carry_modulus);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -1,79 +0,0 @@
-#ifndef CUDA_INTEGER_NEGATE_CUH
-#define CUDA_INTEGER_NEGATE_CUH
-
-#ifdef __CDT_PARSER__
-#undef __CUDA_RUNTIME_H__
-#include <cuda_runtime.h>
-#endif
-
-#include "device.h"
-#include "integer.h"
-#include "utils/kernel_dimensions.cuh"
-
-template <typename Torus>
-__global__ void
-device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
-                              uint64_t lwe_dimension, uint64_t message_modulus,
-                              uint64_t carry_modulus, uint64_t delta) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < lwe_dimension + 1) {
-    bool is_body = (tid == lwe_dimension);
-
-    // z = ceil( degree / 2^p ) * 2^p
-    uint64_t z = (2 * message_modulus - 1) / message_modulus;
-    __syncthreads();
-    z *= message_modulus;
-
-    // (0,Delta*z) - ct
-    output[tid] = (is_body ? z * delta - input[tid] : -input[tid]);
-
-    for (int radix_block_id = 1; radix_block_id < num_blocks;
-         radix_block_id++) {
-      tid += (lwe_dimension + 1);
-
-      // Subtract z/B to the next ciphertext to compensate for the addition of z
-      uint64_t zb = z / message_modulus;
-
-      uint64_t encoded_zb = zb * delta;
-
-      __syncthreads();
-
-      // (0,Delta*z) - ct
-      output[tid] =
-          (is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
-      __syncthreads();
-    }
-  }
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
-                                          Torus *input, uint32_t lwe_dimension,
-                                          uint32_t input_lwe_ciphertext_count,
-                                          uint64_t message_modulus,
-                                          uint64_t carry_modulus) {
-  cudaSetDevice(stream->gpu_index);
-
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-  uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_integer_radix_negation<<<grid, thds, shared_mem, stream->stream>>>(
-      output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
-      carry_modulus, delta);
-  check_cuda_error(cudaGetLastError());
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
@@ -1,12 +0,0 @@
-#include "integer/scalar_addition.cuh"
-
-void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
-    cuda_stream_t *stream, void *lwe_array, void *scalar_input,
-    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus) {
-
-  host_integer_radix_scalar_addition_inplace(
-      stream, static_cast<uint64_t *>(lwe_array),
-      static_cast<uint64_t *>(scalar_input), lwe_dimension,
-      lwe_ciphertext_count, message_modulus, carry_modulus);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -1,130 +0,0 @@
-#ifndef CUDA_INTEGER_ADD_CUH
-#define CUDA_INTEGER_ADD_CUH
-
-#ifdef __CDT_PARSER__
-#undef __CUDA_RUNTIME_H__
-#include <cuda_runtime.h>
-#endif
-
-#include "device.h"
-#include "integer.h"
-#include "utils/kernel_dimensions.cuh"
-#include <stdio.h>
-
-template <typename Torus>
-__global__ void device_integer_radix_scalar_addition_inplace(
-    Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
-    uint32_t lwe_dimension, uint64_t delta) {
-
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < num_blocks) {
-    Torus scalar = scalar_input[tid];
-    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
-
-    *body += scalar * delta;
-  }
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_scalar_addition_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
-    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus) {
-  cudaSetDevice(stream->gpu_index);
-
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = input_lwe_ciphertext_count;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_integer_radix_scalar_addition_inplace<<<grid, thds, 0,
-                                                 stream->stream>>>(
-      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
-      delta);
-  check_cuda_error(cudaGetLastError());
-}
-
-template <typename Torus>
-__global__ void device_integer_radix_add_scalar_one_inplace(
-    Torus *lwe_array, int32_t num_blocks, uint32_t lwe_dimension,
-    uint64_t delta) {
-
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < num_blocks) {
-    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
-    *body += delta;
-  }
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_add_scalar_one_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, uint32_t lwe_dimension,
-    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus) {
-  cudaSetDevice(stream->gpu_index);
-
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = input_lwe_ciphertext_count;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0,
-                                                stream->stream>>>(
-      lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
-  check_cuda_error(cudaGetLastError());
-}
-
-template <typename Torus>
-__global__ void device_integer_radix_scalar_subtraction_inplace(
-    Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
-    uint32_t lwe_dimension, uint64_t delta) {
-
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < num_blocks) {
-    Torus scalar = scalar_input[tid];
-    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
-
-    *body -= scalar * delta;
-  }
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_scalar_subtraction_inplace(
-    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
-    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
-    uint32_t message_modulus, uint32_t carry_modulus) {
-  cudaSetDevice(stream->gpu_index);
-
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = input_lwe_ciphertext_count;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
-                                                    stream->stream>>>(
-      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
-      delta);
-  check_cuda_error(cudaGetLastError());
-}
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -1,14 +0,0 @@
-#include "integer/scalar_bitops.cuh"
-
-void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_input,
-    void *clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk,
-    void *ksk, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
-
-  host_integer_radix_scalar_bitop_kb<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array_out),
-      static_cast<uint64_t *>(lwe_array_input),
-      static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
-      lwe_ciphertext_count, op);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -1,49 +0,0 @@
-#ifndef CUDA_INTEGER_SCALAR_BITWISE_OPS_CUH
-#define CUDA_INTEGER_SCALAR_BITWISE_OPS_CUH
-
-#include "integer/bitwise_ops.cuh"
-#include <omp.h>
-
-template <typename Torus>
-__host__ void host_integer_radix_scalar_bitop_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_input,
-    Torus *clear_blocks, uint32_t num_clear_blocks,
-    int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
-    uint32_t num_radix_blocks, BITOP_TYPE op) {
-
-  auto lut = mem_ptr->lut;
-  auto params = lut->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-
-  uint32_t lwe_size = big_lwe_dimension + 1;
-
-  if (num_clear_blocks == 0) {
-    if (op == SCALAR_BITAND) {
-      cuda_memset_async(lwe_array_out, 0,
-                        num_radix_blocks * lwe_size * sizeof(Torus), stream);
-    } else {
-      cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
-                                   num_radix_blocks * lwe_size * sizeof(Torus),
-                                   stream);
-    }
-  } else {
-    // We have all possible LUTs pre-computed and we use the decomposed scalar
-    // as index to recover the right one
-    cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
-                                 num_clear_blocks * sizeof(Torus), stream);
-
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
-        lut);
-
-    if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
-      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
-      cuda_memset_async(lwe_array_out_block, 0,
-                        (num_radix_blocks - num_clear_blocks) * lwe_size *
-                            sizeof(Torus),
-                        stream);
-    }
-  }
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
@@ -1,41 +0,0 @@
-#include "integer/scalar_comparison.cuh"
-
-void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
-    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
-    void *scalar_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
-    uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) {
-
-  int_comparison_buffer<uint64_t> *buffer =
-      (int_comparison_buffer<uint64_t> *)mem_ptr;
-  switch (buffer->op) {
-  case EQ:
-  case NE:
-    host_integer_radix_scalar_equality_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
-    break;
-  case GT:
-  case GE:
-  case LT:
-  case LE:
-    host_integer_radix_scalar_difference_check_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(scalar_blocks), buffer,
-        buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
-        lwe_ciphertext_count, num_scalar_blocks);
-    break;
-  case MAX:
-  case MIN:
-    host_integer_radix_scalar_maxmin_kb<uint64_t>(
-        stream, static_cast<uint64_t *>(lwe_array_out),
-        static_cast<uint64_t *>(lwe_array_in),
-        static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
-        static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
-    break;
-  default:
-    PANIC("Cuda error: integer operation not supported");
-  }
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -1,400 +0,0 @@
-#ifndef CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
-#define CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
-
-#include "integer/comparison.cuh"
-#include <omp.h>
-
-template <typename Torus>
-__host__ void host_integer_radix_scalar_difference_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
-    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
-    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  auto diff_buffer = mem_ptr->diff_buffer;
-
-  size_t big_lwe_size = big_lwe_dimension + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  // Reducing the signs is the bottleneck of the comparison algorithms,
-  // however if the scalar case there is an improvement:
-  //
-  // The idea is to reduce the number of signs block we have to
-  // reduce. We can do that by splitting the comparison problem in two parts.
-  //
-  // - One part where we compute the signs block between the scalar with just
-  // enough blocks
-  //   from the ciphertext that can represent the scalar value
-  //
-  // - The other part is to compare the ciphertext blocks not considered for the
-  // sign
-  //   computation with zero, and create a single sign block from that.
-  //
-  // The smaller the scalar value is compared to the ciphertext num bits
-  // encrypted, the more the comparisons with zeros we have to do, and the less
-  // signs block we will have to reduce.
-  //
-  // This will create a speedup as comparing a bunch of blocks with 0
-  // is faster
-  if (total_num_scalar_blocks == 0) {
-    // We only have to compare blocks with zero
-    // means scalar is zero
-    host_compare_with_zero_equality(
-        stream, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsk, ksk,
-        total_num_radix_blocks, mem_ptr->is_zero_lut);
-
-    auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
-      x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
-
-      return sign_handler_f(x);
-    };
-
-    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
-                                       polynomial_size, message_modulus,
-                                       carry_modulus, scalar_last_leaf_lut_f);
-
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);
-
-    // The result will be in the two first block. Everything else is
-    //  garbage.
-    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                      big_lwe_size_bytes * (total_num_radix_blocks - 1),
-                      stream);
-
-  } else if (total_num_scalar_blocks < total_num_radix_blocks) {
-    // We have to handle both part of the work described above
-
-    uint32_t num_lsb_radix_blocks = total_num_scalar_blocks;
-    uint32_t num_msb_radix_blocks =
-        total_num_radix_blocks - num_lsb_radix_blocks;
-
-    auto lsb = lwe_array_in;
-    auto msb = lwe_array_in + num_lsb_radix_blocks * big_lwe_size;
-
-    auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
-    auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;
-
-    cuda_synchronize_stream(stream);
-    auto lsb_stream = mem_ptr->lsb_stream;
-    auto msb_stream = mem_ptr->msb_stream;
-
-#pragma omp parallel sections
-    {
-      // Both sections may be executed in parallel
-#pragma omp section
-      {
-        //////////////
-        // lsb
-        Torus *lhs = diff_buffer->tmp_packed_left;
-        Torus *rhs = diff_buffer->tmp_packed_right;
-
-        pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
-                    num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
-                    message_modulus);
-
-        // From this point we have half number of blocks
-        num_lsb_radix_blocks /= 2;
-        num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
-
-        // comparisons will be assigned
-        // - 0 if lhs < rhs
-        // - 1 if lhs == rhs
-        // - 2 if lhs > rhs
-
-        auto comparisons = mem_ptr->tmp_block_comparisons;
-        scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
-                                       mem_ptr, bsk, ksk, num_lsb_radix_blocks);
-
-        // Reduces a vec containing radix blocks that encrypts a sign
-        // (inferior, equal, superior) to one single radix block containing the
-        // final sign
-        tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
-                            mem_ptr->diff_buffer->tree_buffer,
-                            mem_ptr->cleaning_lut_f, bsk, ksk,
-                            num_lsb_radix_blocks);
-      }
-#pragma omp section
-      {
-        //////////////
-        // msb
-        host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
-                                        mem_ptr, bsk, ksk, num_msb_radix_blocks,
-                                        mem_ptr->is_zero_lut);
-      }
-    }
-    cuda_synchronize_stream(lsb_stream);
-    cuda_synchronize_stream(msb_stream);
-
-    //////////////
-    // Reduce the two blocks into one final
-
-    auto scalar_bivariate_last_leaf_lut_f =
-        [sign_handler_f](Torus lsb, Torus msb) -> Torus {
-      if (msb == 1)
-        return sign_handler_f(lsb);
-      else
-        return sign_handler_f(IS_SUPERIOR);
-    };
-
-    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-    generate_device_accumulator_bivariate<Torus>(
-        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
-        carry_modulus, scalar_bivariate_last_leaf_lut_f);
-
-    integer_radix_apply_bivariate_lookup_table_kb(
-        stream, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsk, ksk,
-        1, lut);
-
-    // The result will be in the first block. Everything else is garbage.
-    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                      (total_num_radix_blocks - 1) * big_lwe_size_bytes,
-                      stream);
-  } else {
-    // We only have to do the regular comparison
-    // And not the part where we compare most significant blocks with zeros
-    // total_num_radix_blocks == total_num_scalar_blocks
-    uint32_t num_lsb_radix_blocks = total_num_radix_blocks;
-    uint32_t num_scalar_blocks = total_num_scalar_blocks;
-
-    auto lsb = lwe_array_in;
-
-    Torus *lhs = diff_buffer->tmp_packed_left;
-    Torus *rhs = diff_buffer->tmp_packed_right;
-
-    pack_blocks(stream, lhs, lwe_array_in, big_lwe_dimension,
-                num_lsb_radix_blocks, message_modulus);
-    pack_blocks(stream, rhs, scalar_blocks, 0, num_scalar_blocks,
-                message_modulus);
-
-    // From this point we have half number of blocks
-    num_lsb_radix_blocks /= 2;
-    num_scalar_blocks /= 2;
-
-    // comparisons will be assigned
-    // - 0 if lhs < rhs
-    // - 1 if lhs == rhs
-    // - 2 if lhs > rhs
-    auto comparisons = mem_ptr->tmp_lwe_array_out;
-    scalar_compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk,
-                                   ksk, num_lsb_radix_blocks);
-
-    // Reduces a vec containing radix blocks that encrypts a sign
-    // (inferior, equal, superior) to one single radix block containing the
-    // final sign
-    tree_sign_reduction(stream, lwe_array_out, comparisons,
-                        mem_ptr->diff_buffer->tree_buffer, sign_handler_f, bsk,
-                        ksk, num_lsb_radix_blocks);
-
-    // The result will be in the first block. Everything else is garbage.
-    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                      (total_num_radix_blocks - 1) * big_lwe_size_bytes,
-                      stream);
-  }
-}
-
-template <typename Torus>
-__host__ void
-scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
-                               Torus *lwe_array_in, Torus *scalar_blocks,
-                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-                               Torus *ksk, uint32_t num_radix_blocks) {
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
-
-  // When rhs > lhs, the subtraction will overflow, and the bit of padding will
-  // be set to 1
-  // meaning that the output of the pbs will be the negative (modulo message
-  // space)
-  //
-  // Example:
-  // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
-  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
-  // Since there was an overflow the bit of padding is 1 and not 0.
-  // When applying the LUT for an input value of 14 we would expect 1,
-  // but since the bit of padding is 1, we will get -1 modulus our message
-  // space, so (-1) % (4 * 4) = 15 = 1|1111 We then add one and get 0 = 0|0000
-
-  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
-  cuda_memcpy_async_gpu_to_gpu(
-      subtracted_blocks, lwe_array_in,
-      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
-  // Subtract
-  // Here we need the true lwe sub, not the one that comes from shortint.
-  host_integer_radix_scalar_subtraction_inplace(
-      stream, subtracted_blocks, scalar_blocks, big_lwe_dimension,
-      num_radix_blocks, message_modulus, carry_modulus);
-
-  // Apply LUT to compare to 0
-  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
-                                                 subtracted_blocks, bsk, ksk,
-                                                 num_radix_blocks, sign_lut);
-
-  // Add one
-  // Here Lhs can have the following values: (-1) % (message modulus * carry
-  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
-                                            big_lwe_dimension, num_radix_blocks,
-                                            message_modulus, carry_modulus);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_scalar_maxmin_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t total_num_radix_blocks,
-    uint32_t total_num_scalar_blocks) {
-
-  auto params = mem_ptr->params;
-
-  // Calculates the difference sign between the ciphertext and the scalar
-  // - 0 if lhs < rhs
-  // - 1 if lhs == rhs
-  // - 2 if lhs > rhs
-  auto sign = mem_ptr->tmp_lwe_array_out;
-  host_integer_radix_scalar_difference_check_kb(
-      stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
-      mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks,
-      total_num_scalar_blocks);
-
-  // There is no optimized CMUX for scalars, so we convert to a trivial
-  // ciphertext
-  auto lwe_array_left = lwe_array_in;
-  auto lwe_array_right = mem_ptr->tmp_block_comparisons;
-
-  create_trivial_radix(stream, lwe_array_right, scalar_blocks,
-                       params.big_lwe_dimension, total_num_radix_blocks,
-                       total_num_scalar_blocks, params.message_modulus,
-                       params.carry_modulus);
-
-  // Selector
-  // CMUX for Max or Min
-  host_integer_radix_cmux_kb(
-      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_scalar_equality_check_kb(
-    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
-    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
-    Torus *ksk, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
-
-  auto params = mem_ptr->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
-  auto message_modulus = params.message_modulus;
-
-  auto eq_buffer = mem_ptr->eq_buffer;
-
-  size_t big_lwe_size = big_lwe_dimension + 1;
-  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
-
-  auto scalar_comparison_luts = eq_buffer->scalar_comparison_luts;
-
-  uint32_t num_halved_scalar_blocks =
-      (num_scalar_blocks / 2) + (num_scalar_blocks % 2);
-
-  uint32_t num_lsb_radix_blocks =
-      std::min(num_radix_blocks, 2 * num_halved_scalar_blocks);
-  uint32_t num_msb_radix_blocks = num_radix_blocks - num_lsb_radix_blocks;
-  uint32_t num_halved_lsb_radix_blocks =
-      (num_lsb_radix_blocks / 2) + (num_lsb_radix_blocks % 2);
-
-  auto lsb = lwe_array_in;
-  auto msb = lwe_array_in + big_lwe_size * num_lsb_radix_blocks;
-
-  auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
-  auto lwe_array_msb_out =
-      lwe_array_lsb_out + big_lwe_size * num_halved_lsb_radix_blocks;
-
-  cuda_synchronize_stream(stream);
-
-  auto lsb_stream = mem_ptr->lsb_stream;
-  auto msb_stream = mem_ptr->msb_stream;
-
-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      if (num_halved_scalar_blocks > 0) {
-        auto packed_blocks = mem_ptr->tmp_packed_input;
-        auto packed_scalar =
-            packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
-
-        pack_blocks(lsb_stream, packed_blocks, lsb, big_lwe_dimension,
-                    num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_stream, packed_scalar, scalar_blocks, 0,
-                    num_scalar_blocks, message_modulus);
-
-        cuda_memcpy_async_gpu_to_gpu(
-            scalar_comparison_luts->lut_indexes, packed_scalar,
-            num_halved_scalar_blocks * sizeof(Torus), lsb_stream);
-
-        integer_radix_apply_univariate_lookup_table_kb(
-            lsb_stream, lwe_array_lsb_out, packed_blocks, bsk, ksk,
-            num_halved_lsb_radix_blocks, scalar_comparison_luts);
-      }
-    }
-#pragma omp section
-    {
-      //////////////
-      // msb
-      if (num_msb_radix_blocks > 0) {
-        int_radix_lut<Torus> *msb_lut;
-        switch (mem_ptr->op) {
-        case COMPARISON_TYPE::EQ:
-          msb_lut = mem_ptr->is_zero_lut;
-          break;
-        case COMPARISON_TYPE::NE:
-          msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-          break;
-        default:
-          PANIC("Cuda error: integer operation not supported");
-        }
-
-        host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
-                                        mem_ptr, bsk, ksk, num_msb_radix_blocks,
-                                        msb_lut);
-      }
-    }
-  }
-
-  cuda_synchronize_stream(lsb_stream);
-  cuda_synchronize_stream(msb_stream);
-
-  switch (mem_ptr->op) {
-  case COMPARISON_TYPE::EQ:
-    are_all_comparisons_block_true(
-        stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
-        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
-    break;
-  case COMPARISON_TYPE::NE:
-    is_at_least_one_comparisons_block_true(
-        stream, lwe_array_out, lwe_array_lsb_out, mem_ptr, bsk, ksk,
-        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
-    break;
-  default:
-    PANIC("Cuda error: integer operation not supported");
-  }
-
-  // The result will be in the two first block. Everything else is
-  //  garbage.
-  if (num_radix_blocks > 1)
-    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
-                      big_lwe_size_bytes * (num_radix_blocks - 1), stream);
-}
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -1,40 +0,0 @@
-#include "scalar_rotate.cuh"
-
-void scratch_cuda_integer_radix_scalar_rotate_kb_64(
-    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t big_lwe_dimension,
-    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, SHIFT_TYPE shift_type, bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
-      stream, (int_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
-      shift_type, allocate_gpu_memory);
-}
-
-void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,
-                                                    void *lwe_array, uint32_t n,
-                                                    int8_t *mem_ptr, void *bsk,
-                                                    void *ksk,
-                                                    uint32_t num_blocks) {
-
-  host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
-      stream, static_cast<uint64_t *>(lwe_array), n,
-      (int_shift_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
-      num_blocks);
-}
-
-void cleanup_cuda_integer_radix_scalar_rotate(cuda_stream_t *stream,
-                                              int8_t **mem_ptr_void) {
-
-  int_shift_buffer<uint64_t> *mem_ptr =
-      (int_shift_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release(stream);
-}
--- a/Show More
+++ b/Show More