Compare commits

...

54 Commits

Author SHA1 Message Date
David Testé
146aaf662a chore(ci): spawn ec2 instance on-demand for code-coverage 2024-05-02 15:41:33 +02:00
David Testé
c28c8beb69 chore(ci): speed-up core_crypto code coverage execution duration 2024-05-02 15:40:09 +02:00
Nicolas Sarlin
08fd31f8fc feat(gpu): add compatibility with cuda from package managers 2024-05-02 10:14:22 +02:00
Nicolas Sarlin
ba18519946 chore(ci): update toolchain to latest nightly 2024-04-30 16:50:18 +02:00
Beka Barbakadze
ab8a51bc0d feat(gpu): implement div_rem for cuda GPU 2024-04-30 15:53:02 +02:00
Arthur Meyre
9ecb9f416e chore(doc): fix broken reference 2024-04-30 11:25:06 +02:00
David Testé
0618b9422c chore(ci): separate pcc and tests for gpu to allow faster merge 2024-04-30 08:51:46 +02:00
David Testé
5bb3af30b6 chore(bench): fix naming format for signed scalar operations 2024-04-30 08:51:29 +02:00
aquint-zama
ef1607db51 chore(doc): fix broken link 2024-04-29 14:35:18 +02:00
Arthur Meyre
d2daf449b7 chore(core): fix the stair KS test for more generic stair KS parameters
- the actual large LWE dimension to consider is the partial fill, as those
are the non-zero coefficients on which the shared coefficients are then
computed
- the 4-bit parameters don't exhibit the bug, as they are not on the GLWE
noise plateau
2024-04-29 13:23:46 +02:00
dependabot[bot]
826c80ea6d chore(deps): bump actions/upload-artifact from 4.3.2 to 4.3.3
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.2 to 4.3.3.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](1746f4ab65...65462800fd)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-29 13:23:12 +02:00
dependabot[bot]
bc991e1946 chore(deps): bump actions/checkout from 4.1.3 to 4.1.4
Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.3 to 4.1.4.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](1d96c772d1...0ad4b8fada)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-29 13:23:06 +02:00
Pedro Alves
f88e309503 feat(gpu): Implements a classical PBS variant that uses thread block clusters and distributed shared memory 2024-04-29 09:42:58 +02:00
Pedro Alves
cbc23c08a2 feat(gpu): Implements a multi-bit PBS variant that uses thread block
clusters and distributed shared memory
2024-04-29 09:42:58 +02:00
Pedro Alves
e9ee813911 fix(gpu): fix out-of-memory error in the custom benchmark tool 2024-04-29 09:42:58 +02:00
Pedro Alves
3ebc114c85 fix(gpu): implements a NOSM mode for tree_add_chunks() 2024-04-29 09:42:58 +02:00
Pedro Alves
effe6a9ec8 chore(gpu): remove message_3_carry_3 parameters from scalar/encrypted comparison tests 2024-04-29 09:42:58 +02:00
Pedro Alves
a8fe4d9f38 fix(gpu): fix multi-bit PBS when pbs_level > 1 2024-04-29 09:42:58 +02:00
Pedro Alves
af7c1e14a4 fix(gpu): refactor PBS' host functions to run using the same variant as the scratch function 2024-04-29 09:42:58 +02:00
Agnes Leroy
85fbca62a2 chore(gpu): extend integer multiplication to N other than 2048 2024-04-29 09:42:58 +02:00
Agnes Leroy
9c36ee477b chore(gpu): fix params in GPU integer benchmarks 2024-04-26 13:00:01 -03:00
Pedro Alves
9d70e695b5 chore(gpu): automatically compute the best lwe_chunk_size 2024-04-26 11:58:19 -03:00
Pedro Alves
33e7cfea2d chore(gpu): move luts_message_carry allocation to host_integer_sum_ciphertexts_vec_kb to reduce memory consumption 2024-04-26 09:20:39 -03:00
Daniel Demmler
ce0aea171b fix(zk): correct byte indices for uncompressed serialization 2024-04-25 18:06:54 +02:00
Kelong Cong
083c068144 fix(zk): expose compute_crs_params since we need big_d and b_r too
Knowing these two values as well allows us to use
PublicParams::from_vec.
2024-04-25 16:45:11 +02:00
Kelong Cong
8912e887e3 feat(zk): open up the API to output the CRS witness dimension
The motivation for this PR is that the CRS is sometimes generated by a ceremony
instead of centrally, so it is useful to have a function that computes the
CRS witness dimension without generating the CRS centrally.
2024-04-25 16:45:11 +02:00
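
(Taken together with the previous commit, this lets a CRS produced by an external ceremony be handled entirely locally: compute_crs_params yields big_d, b_r, and the witness dimension up front, and the ceremony output can then be loaded through PublicParams::from_vec instead of generating the CRS in-process. This is a reading of the two commit messages above; the exact function signatures are not shown in this diff.)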
tmontaigu
ddd0f2ce79 chore(integer): reduce nb tests for big params 2024-04-25 13:53:58 +02:00
tmontaigu
aeab9e851b fix(integer): fix match_value 2024-04-25 13:53:58 +02:00
Mayeul@Zama
919f07f911 fix(shortint): is_functional_bivariate_pbs_possible takes an optional lut 2024-04-24 18:55:37 +02:00
tmontaigu
6199e04ce0 feat(integer): add vector match_value/index_of/contains/is_in 2024-04-23 10:55:36 +02:00
dependabot[bot]
77a6673d59 chore(deps): bump dtolnay/rust-toolchain
Bumps [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) from 7164405e8653277d57afd42ba081b5aa02a70396 to bb45937a053e097f8591208d8e74c90db1873d07.
- [Release notes](https://github.com/dtolnay/rust-toolchain/releases)
- [Commits](7164405e86...bb45937a05)

---
updated-dependencies:
- dependency-name: dtolnay/rust-toolchain
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-22 13:01:57 +02:00
dependabot[bot]
631dd09445 chore(deps): bump tj-actions/changed-files from 44.0.1 to 44.3.0
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 44.0.1 to 44.3.0.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](635f118699...0874344d6e)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-22 09:20:46 +02:00
dependabot[bot]
a7f6ea12ce chore(deps): bump actions/upload-artifact from 4.3.1 to 4.3.2
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.1 to 4.3.2.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](5d5d22a312...1746f4ab65)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-22 09:20:21 +02:00
dependabot[bot]
0a1cba43c4 chore(deps): bump actions/checkout from 4.1.2 to 4.1.3
Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.2 to 4.1.3.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](9bb56186c3...1d96c772d1)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-22 09:20:12 +02:00
Agnes Leroy
666926e4b0 chore(gpu): add tests for N=8192 multi-bit PBS 2024-04-19 17:02:55 +02:00
Agnes Leroy
105e2488f6 chore(gpu): add casts and signed scalar comparisons to the hl api 2024-04-18 18:20:24 +02:00
Agnes Leroy
cf62937632 feat(gpu): cast signed/unsigned and signed/signed 2024-04-18 09:11:53 +02:00
Arthur Meyre
c0985707ad chore(doc): fix API calls 2024-04-15 18:56:00 +02:00
Arthur Meyre
2cc20a6bd5 chore(ci): fix sage script f-string missing f 2024-04-15 18:55:06 +02:00
Agnes Leroy
28db75b3a3 fix(gpu): fix typo in doc 2024-04-15 17:06:49 +02:00
dependabot[bot]
6f31bffef6 chore(deps): bump codecov/codecov-action from 4.2.0 to 4.3.0
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 4.2.0 to 4.3.0.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](7afa10ed9b...84508663e9)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-15 13:15:10 +02:00
dependabot[bot]
9e6b251e68 chore(deps): bump dtolnay/rust-toolchain
Bumps [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) from dc6353516c68da0f06325f42ad880f76a5e77ec9 to 7164405e8653277d57afd42ba081b5aa02a70396.
- [Release notes](https://github.com/dtolnay/rust-toolchain/releases)
- [Commits](dc6353516c...7164405e86)

---
updated-dependencies:
- dependency-name: dtolnay/rust-toolchain
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-15 13:14:42 +02:00
dependabot[bot]
da46d26f1b chore(deps): bump tj-actions/changed-files from 44.0.0 to 44.0.1
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 44.0.0 to 44.0.1.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](2d756ea4c5...635f118699)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-15 12:24:24 +02:00
Mayeul@Zama
b4662607bf style(core): remove newline 2024-04-15 09:29:05 +02:00
Mayeul@Zama
faac5e10e6 style(c_api): fix typo 2024-04-15 09:29:05 +02:00
Mayeul@Zama
0fa5231e5a style(core): remove allow(clippy::used_underscore_binding) 2024-04-15 09:29:05 +02:00
Arthur Meyre
9f36cf7120 chore(ci): make a github runner remove a label 2024-04-12 09:54:45 +02:00
Arthur Meyre
a654a47136 chore(c): change include <tfhe.h> to "tfhe.h"
- as tfhe is not a system library, C/C++ practice is to use the "" style
2024-04-12 09:54:37 +02:00
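
A minimal illustration of that convention, as a sketch rather than code from this repository (tfhe.h stands in for any project-provided header):

    #include <tfhe.h>  /* angle form: searched only in the implementation's
                          system/include paths */
    #include "tfhe.h"  /* quote form: typically the directory of the including
                          file is searched first, then the same paths as the
                          angle form */

Since tfhe.h ships with the project rather than the system, the quote form is the conventional choice; the angle form is reserved for system libraries.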
Arthur Meyre
29dfac320c chore(c_api): fix build on M1 mac 2024-04-11 09:23:40 +02:00
Arthur Meyre
af11862dc1 chore(ci): add zk feature for deterministic FFT C API 2024-04-11 09:23:40 +02:00
Arthur Meyre
1b5745c22f chore(ci): add missing install for clippy_task 2024-04-11 09:23:40 +02:00
Arthur Meyre
e6970fc00d chore(tfhe): update serialization version 2024-04-11 09:23:20 +02:00
Arthur Meyre
cc4b7dc33e refactor(shortint): add max_noise_level field to CompressedServerKey 2024-04-11 09:23:20 +02:00
Agnes Leroy
5417ba53be fix(gpu): add edge test for scalar comparisons 2024-04-10 06:43:17 -03:00
171 changed files with 10494 additions and 1619 deletions

View File

@@ -45,14 +45,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -29,10 +29,10 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -36,8 +36,8 @@ jobs:
job-secret: ${{ secrets.JOB_SECRET }}
profile: gpu-test
cuda-tests-linux:
name: CUDA tests
cuda-pcc:
name: CUDA post-commit checks
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
@@ -56,14 +56,14 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable
@@ -94,6 +94,64 @@ jobs:
run: |
make pcc_gpu
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
cuda-tests-linux:
name: CUDA tests
needs: [ setup-ec2, cuda-pcc ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
@@ -121,7 +179,7 @@ jobs:
teardown-ec2:
name: Teardown EC2 instance (cuda-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cuda-tests-linux ]
needs: [ setup-ec2, cuda-pcc, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
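
(Net effect of commit 0618b9422c above: the former cuda-tests-linux job becomes cuda-pcc and runs only the post-commit checks via make pcc_gpu, a new cuda-tests-linux job that depends on it runs the full suite via make test_gpu, and the EC2 teardown now waits on both jobs.)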

View File

@@ -46,14 +46,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -46,14 +46,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -46,14 +46,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -46,14 +46,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -97,13 +97,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_boolean
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -23,7 +23,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Install and run newline linter checks
if: matrix.os == 'ubuntu-latest'

View File

@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Get actionlint
run: |

View File

@@ -6,70 +6,58 @@ env:
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: 'Slab request ID'
type: string
fork_repo:
description: 'Name of forked repo as user/repo'
type: string
fork_git_sha:
description: 'Git SHA to checkout from fork'
type: string
jobs:
setup-ec2:
name: Setup EC2 instance (code-coverage)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
aws-region: ${{ steps.start-instance.outputs.aws-region }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
profile: cpu-small
code-coverage:
name: Code coverage
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
timeout-minutes: 11520 # 8 days
steps:
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
with:
files_yaml: |
tfhe:
@@ -78,29 +66,29 @@ jobs:
- concrete-csprng/src/**
- name: Generate Keys
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make GEN_KEY_CACHE_COVERAGE_ONLY=TRUE gen_key_cache
make gen_key_cache_core_crypto
- name: Run coverage for core_crypto
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_core_crypto_cov AVX512_SUPPORT=ON
- name: Run coverage for boolean
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_boolean_cov
- name: Run coverage for shortint
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
uses: codecov/codecov-action@84508663e988701840491b86de86b666e8a86bed
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./coverage/
@@ -108,13 +96,13 @@ jobs:
files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml
- name: Run integer coverage
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@7afa10ed9b269c561c2336fd862446844e0cbf71
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
uses: codecov/codecov-action@84508663e988701840491b86de86b666e8a86bed
# if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./coverage/
@@ -127,8 +115,29 @@ jobs:
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
teardown-ec2:
name: Teardown EC2 instance (code-coverage)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, code-coverage ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
region: ${{ needs.setup-ec2.outputs.aws-region }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (code-coverage) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
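
(This is the on-demand EC2 change from commit 146aaf662a above: a setup-ec2 job now starts the runner through zama-ai/slab-github-runner, the coverage job runs on the runner name that job outputs, and a teardown-ec2 job always stops the instance afterwards, replacing the old scheme where Slab supplied instance details through workflow_dispatch inputs.)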

View File

@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -88,13 +88,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -61,7 +61,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -71,7 +71,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -118,13 +118,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -47,14 +47,14 @@ jobs:
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -39,7 +39,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -52,12 +52,12 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -81,7 +81,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -120,7 +120,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -133,12 +133,12 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -163,7 +163,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
@@ -194,7 +194,7 @@ jobs:
name: Remove 4090 bench label
if: ${{ always() && github.event_name == 'pull_request' }}
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
runs-on: ["self-hosted", "4090-desktop"]
runs-on: ubuntu-latest
steps:
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
with:

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -74,7 +74,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -92,12 +92,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -121,7 +121,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -56,7 +56,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -66,7 +66,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -100,7 +100,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -122,13 +122,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -64,7 +64,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -82,7 +82,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -107,7 +107,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -132,7 +132,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -57,7 +57,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -67,7 +67,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -101,7 +101,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -123,13 +123,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -31,10 +31,10 @@ jobs:
timeout-minutes: 720
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
- uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable
@@ -86,6 +86,13 @@ jobs:
run: |
make test_boolean
# Because we do "illegal" things with the build system which Cargo does not seem to like much
# we need to clear the cache to make sure the C API is built properly and does not use a stale
# cached version
- name: Clear build cache
run: |
cargo clean
- name: Run C API tests
run: |
make test_c_api

View File

@@ -30,7 +30,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0

View File

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0

View File

@@ -55,7 +55,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -64,7 +64,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: stable

View File

@@ -17,10 +17,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
- name: Checkout lattice-estimator
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: malb/lattice-estimator
path: lattice_estimator

View File

@@ -45,7 +45,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -55,7 +55,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -89,13 +89,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_shortint
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -53,7 +53,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -71,12 +71,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -115,7 +115,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -52,7 +52,7 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -70,12 +70,12 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab
@@ -99,7 +99,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -46,7 +46,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -56,7 +56,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -70,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -91,13 +91,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -58,13 +58,13 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@2d756ea4c53f7f6b397767d8723b3a10a9f35bf2
uses: tj-actions/changed-files@0874344d6ebbaa00a27da73276ae7162fadcaf69
with:
files_yaml: |
common_benches:
@@ -111,7 +111,7 @@ jobs:
- .github/workflows/wasm_client_benchmark.yml
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -30,12 +30,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -13,11 +13,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
- name: Save repo
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: repo-archive
path: '.'

View File

@@ -53,7 +53,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
fetch-depth: 0
@@ -63,7 +63,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@dc6353516c68da0f06325f42ad880f76a5e77ec9
uses: dtolnay/rust-toolchain@bb45937a053e097f8591208d8e74c90db1873d07
with:
toolchain: nightly
@@ -98,13 +98,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b
with:
repository: zama-ai/slab
path: slab

View File

@@ -271,7 +271,7 @@ clippy_js_wasm_api: install_rs_check_toolchain
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
clippy_tasks:
clippy_tasks: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-p tasks -- --no-deps -D warnings
@@ -281,19 +281,19 @@ clippy_trivium: install_rs_check_toolchain
-p tfhe-trivium -- --no-deps -D warnings
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
clippy_all_targets:
clippy_all_targets: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok-experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
clippy_concrete_csprng:
clippy_concrete_csprng: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE) \
-p concrete-csprng -- --no-deps -D warnings
.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
clippy_zk_pok:
clippy_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok -- --no-deps -D warnings
@@ -376,7 +376,7 @@ build_c_api_gpu: install_rs_check_toolchain
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok-experimental,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
@@ -424,13 +424,6 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
-p $(TFHE_SPEC) -- core_crypto::
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
fi
.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
test_cuda_backend:
@@ -444,14 +437,14 @@ test_cuda_backend:
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
test_core_crypto_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
@@ -479,14 +472,14 @@ test_c_api_rs: install_rs_check_toolchain
.PHONY: test_c_api_c # Run the C tests for the C API
test_c_api_c: build_c_api
./scripts/c_api_tests.sh
./scripts/c_api_tests.sh --cargo-profile "$(CARGO_PROFILE)"
.PHONY: test_c_api # Run all the tests for the C API
test_c_api: test_c_api_rs test_c_api_c
.PHONY: test_c_api_gpu # Run the C tests for the C API
test_c_api_gpu: build_c_api_gpu
./scripts/c_api_tests.sh --gpu
./scripts/c_api_tests.sh --gpu --cargo-profile "$(CARGO_PROFILE)"
.PHONY: test_shortint_ci # Run the tests for shortint ci
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
@@ -638,12 +631,12 @@ test_kreyvium: install_rs_build_toolchain
-p tfhe-trivium -- --test-threads=1 kreyvium::
.PHONY: test_concrete_csprng # Run concrete-csprng tests
test_concrete_csprng:
test_concrete_csprng: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE) -p concrete-csprng
.PHONY: test_zk_pok # Run tfhe-zk-pok-experimental tests
test_zk_pok:
test_zk_pok: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok
@@ -683,14 +676,14 @@ check_md_docs_are_tested:
RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested
.PHONY: check_compile_tests # Build tests in debug without running them
check_compile_tests:
check_compile_tests: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
-p $(TFHE_SPEC)
@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
"$(MAKE)" build_c_api && \
./scripts/c_api_tests.sh --build-only; \
./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
fi
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
@@ -813,8 +806,6 @@ bench_oprf: install_rs_check_toolchain
--bench oprf-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
bench_shortint_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -823,7 +814,6 @@ bench_shortint_multi_bit: install_rs_check_toolchain
--bench shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_boolean # Run benchmarks for boolean
bench_boolean: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
@@ -867,6 +857,7 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
#
# Utility tools
#
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
gen_key_cache: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \

View File

@@ -71,7 +71,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
}
fn main() {
let config = ConfigBuilder::all_disabled().enable_default_bool().build();
let config = ConfigBuilder::default().build();
let (client_key, server_key) = generate_keys(config);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -143,7 +143,7 @@ use tfhe::prelude::*;
use tfhe_trivium::TriviumStreamShortint;
fn test_shortint() {
let config = ConfigBuilder::all_disabled().enable_default_integers().build();
let config = ConfigBuilder::default().build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS);
let ksk = CastingKey::new((&client_key, &server_key), (&hl_client_key, &hl_server_key));

View File

@@ -13,6 +13,7 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
[build-dependencies]
cmake = { version = "0.1" }
pkg-config = { version = "0.3" }
[dependencies]
thiserror = "1.0"

View File

@@ -21,7 +21,15 @@ fn main() {
let dest = cmake::build("cuda");
println!("cargo:rustc-link-search=native={}", dest.display());
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
// Try to find the cuda libs with pkg-config, default to the path used by the nvidia runfile
if pkg_config::Config::new()
.atleast_version("10")
.probe("cuda")
.is_err()
{
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
}
println!("cargo:rustc-link-lib=gomp");
println!("cargo:rustc-link-lib=cudart");
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");

View File

@@ -8,7 +8,6 @@
#include <cuda_runtime.h>
#define synchronize_threads_in_block() __syncthreads()
extern "C" {
#define check_cuda_error(ans) \
@@ -57,6 +56,8 @@ void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
bool cuda_check_support_cooperative_groups();
bool cuda_check_support_thread_block_clusters();
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);

View File

@@ -39,6 +39,22 @@ enum COMPARISON_TYPE {
enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
extern "C" {
void scratch_cuda_apply_univariate_lut_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, void *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
void *output_radix_lwe,
void *input_radix_lwe, int8_t *mem_ptr,
void *ksk, void *bsk, uint32_t num_blocks);
void cleanup_cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
int8_t **mem_ptr_void);
void scratch_cuda_full_propagation_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -277,7 +293,23 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void cleanup_cuda_integer_radix_scalar_mul(cuda_stream_t *stream,
int8_t **mem_ptr_void);
}
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory);
void cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *quotient, void *remainder, void *numerator,
void *divisor, int8_t *mem_ptr, void *bsk, void *ksk,
uint32_t num_blocks_in_radix);
void cleanup_cuda_integer_div_rem(cuda_stream_t *stream, int8_t **mem_ptr_void);
} // extern C
template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
@@ -304,6 +336,11 @@ void generate_device_accumulator_bivariate(
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f);
template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, int factor);
/*
* generate univariate accumulator (lut) for device pointer
* v_stream - cuda stream
@@ -966,7 +1003,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
Torus *old_blocks;
Torus *small_lwe_vector;
int_radix_params params;
int_radix_lut<Torus> *luts_message_carry;
int_sc_prop_memory<Torus> *scp_mem;
int32_t *d_smart_copy_in;
@@ -979,10 +1015,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint32_t max_num_radix_in_vec,
bool allocate_gpu_memory) {
this->params = params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// create single carry propagation memory object
scp_mem = new int_sc_prop_memory<Torus>(stream, params, num_blocks_in_radix,
@@ -1002,43 +1034,15 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
d_smart_copy_out =
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
// create lut object for message and carry
luts_message_carry = new int_radix_lut<Torus>(
stream, params, 2, max_pbs_count, allocate_gpu_memory);
auto message_acc = luts_message_carry->get_lut(0);
auto carry_acc = luts_message_carry->get_lut(1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(stream, message_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(stream, carry_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_carry);
}
int_sum_ciphertexts_vec_memory(cuda_stream_t *stream, int_radix_params params,
uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec,
Torus *new_blocks, Torus *old_blocks,
Torus *small_lwe_vector,
int_radix_lut<Torus> *base_lut_object) {
Torus *small_lwe_vector) {
mem_reuse = true;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// create single carry propagation memory object
scp_mem = new int_sc_prop_memory<Torus>(stream, params, num_blocks_in_radix,
@@ -1054,29 +1058,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
d_smart_copy_out =
(int32_t *)cuda_malloc_async(max_pbs_count * sizeof(int32_t), stream);
// create lut object for message and carry
luts_message_carry = new int_radix_lut<Torus>(
stream, params, 2, max_pbs_count, base_lut_object);
auto message_acc = luts_message_carry->get_lut(0);
auto carry_acc = luts_message_carry->get_lut(1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(stream, message_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(stream, carry_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_carry);
}
void release(cuda_stream_t *stream) {
@@ -1090,10 +1071,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
}
scp_mem->release(stream);
luts_message_carry->release(stream);
delete scp_mem;
delete luts_message_carry;
}
};
@@ -1220,7 +1199,7 @@ template <typename Torus> struct int_mul_memory {
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
stream, params, num_radix_blocks, 2 * num_radix_blocks, block_mul_res,
vector_result_sb, small_lwe_vector, luts_array);
vector_result_sb, small_lwe_vector);
}
void release(cuda_stream_t *stream) {
@@ -2156,6 +2135,327 @@ template <typename Torus> struct int_comparison_buffer {
}
};
template <typename Torus> struct int_div_rem_memory {
int_radix_params params;
bool mem_reuse = false;
// memory objects for other operations
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
int_logical_scalar_shift_buffer<Torus> *shift_mem_2;
int_overflowing_sub_memory<Torus> *overflow_sub_mem;
int_comparison_buffer<Torus> *comparison_buffer;
// lookup tables
int_radix_lut<Torus> **masking_luts_1;
int_radix_lut<Torus> **masking_luts_2;
int_radix_lut<Torus> *message_extract_lut_1;
int_radix_lut<Torus> *message_extract_lut_2;
int_radix_lut<Torus> **zero_out_if_overflow_did_not_happen;
int_radix_lut<Torus> **zero_out_if_overflow_happened;
int_radix_lut<Torus> **merge_overflow_flags_luts;
// sub streams
cuda_stream_t *sub_stream_1;
cuda_stream_t *sub_stream_2;
cuda_stream_t *sub_stream_3;
cuda_stream_t *sub_stream_4;
// temporary device buffers
Torus *remainder1;
Torus *remainder2;
Torus *numerator_block_stack;
Torus *numerator_block_1;
Torus *tmp_radix;
Torus *interesting_remainder1;
Torus *interesting_remainder2;
Torus *interesting_divisor;
Torus *divisor_ms_blocks;
Torus *new_remainder;
Torus *subtraction_overflowed;
Torus *did_not_overflow;
Torus *overflow_sum;
Torus *overflow_sum_radix;
Torus *tmp_1;
Torus *at_least_one_upper_block_is_non_zero;
Torus *cleaned_merged_interesting_remainder;
// allocate and initialize, if needed, the temporary arrays used to compute
// the cuda integer div_rem operation
void init_temporary_buffers(cuda_stream_t *stream, uint32_t num_blocks) {
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
// non-boolean temporary arrays, each with `num_blocks` blocks
remainder1 = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
remainder2 = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
numerator_block_stack = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
interesting_remainder2 = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
interesting_divisor = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
divisor_ms_blocks = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
new_remainder = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
cleaned_merged_interesting_remainder = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
tmp_1 = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
// temporary arrays used as stacks
tmp_radix = (Torus *)cuda_malloc_async(
big_lwe_size * (num_blocks + 1) * sizeof(Torus), stream);
interesting_remainder1 = (Torus *)cuda_malloc_async(
big_lwe_size * (num_blocks + 1) * sizeof(Torus), stream);
numerator_block_1 =
(Torus *)cuda_malloc_async(big_lwe_size * 2 * sizeof(Torus), stream);
// temporary arrays for boolean blocks
subtraction_overflowed =
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
did_not_overflow =
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
overflow_sum =
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
overflow_sum_radix = (Torus *)cuda_malloc_async(
big_lwe_size * num_blocks * sizeof(Torus), stream);
at_least_one_upper_block_is_non_zero =
(Torus *)cuda_malloc_async(big_lwe_size * 1 * sizeof(Torus), stream);
}
// initialize lookup tables for div_rem operation
void init_lookup_tables(cuda_stream_t *stream, uint32_t num_blocks) {
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
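// Note: 31 - __builtin_clz(m) computes floor(log2(m)) for m > 0,
// e.g. message_modulus = 4 -> num_bits_in_message = 2.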
// create and generate masking_luts_1[] and masking_luts_2[]
// both are identical, but because they are used in two different
// executions in parallel we need two different pbs_buffers.
masking_luts_1 = new int_radix_lut<Torus> *[params.message_modulus - 1];
masking_luts_2 = new int_radix_lut<Torus> *[params.message_modulus - 1];
for (int i = 0; i < params.message_modulus - 1; i++) {
uint32_t shifted_mask = i;
std::function<Torus(Torus)> lut_f_masking =
[shifted_mask](Torus x) -> Torus { return x & shifted_mask; };
masking_luts_1[i] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
masking_luts_2[i] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
Torus *luts[2] = {masking_luts_1[i]->lut, masking_luts_2[i]->lut};
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
stream, luts[j], params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_masking);
}
}
// create and generate message_extract_lut_1 and message_extract_lut_2
// both are identical, but because they are used in two different
// executions in parallel we need two different pbs_buffers.
message_extract_lut_1 =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
message_extract_lut_2 =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
auto message_modulus = params.message_modulus;
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
Torus *luts[2] = {message_extract_lut_1->lut, message_extract_lut_2->lut};
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
stream, luts[j], params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_message_extract);
}
// Give names to the closures to improve readability
auto overflow_happened = [](uint64_t overflow_sum) {
return overflow_sum != 0;
};
auto overflow_did_not_happen = [&overflow_happened](uint64_t overflow_sum) {
return !overflow_happened(overflow_sum);
};
// create and generate zero_out_if_overflow_did_not_happen
zero_out_if_overflow_did_not_happen = new int_radix_lut<Torus> *[2];
zero_out_if_overflow_did_not_happen[0] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
zero_out_if_overflow_did_not_happen[1] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
auto cur_lut_f = [&](Torus block, Torus overflow_sum) -> Torus {
if (overflow_did_not_happen(overflow_sum)) {
return 0;
} else {
return block;
}
};
generate_device_accumulator_bivariate_with_factor<Torus>(
stream, zero_out_if_overflow_did_not_happen[0]->lut,
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 2);
generate_device_accumulator_bivariate_with_factor<Torus>(
stream, zero_out_if_overflow_did_not_happen[1]->lut,
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 3);
// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
zero_out_if_overflow_happened[0] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
zero_out_if_overflow_happened[1] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
auto overflow_happened_f = [&](Torus block, Torus overflow_sum) -> Torus {
if (overflow_happened(overflow_sum)) {
return 0;
} else {
return block;
}
};
generate_device_accumulator_bivariate_with_factor<Torus>(
stream, zero_out_if_overflow_happened[0]->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
overflow_happened_f, 2);
generate_device_accumulator_bivariate_with_factor<Torus>(
stream, zero_out_if_overflow_happened[1]->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
overflow_happened_f, 3);
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
};
merge_overflow_flags_luts[i] =
new int_radix_lut<Torus>(stream, params, 1, num_blocks, true);
generate_device_accumulator_bivariate<Torus>(
stream, merge_overflow_flags_luts[i]->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_bit);
}
}
int_div_rem_memory(cuda_stream_t *stream, int_radix_params params,
uint32_t num_blocks, bool allocate_gpu_memory) {
this->params = params;
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
stream, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true);
shift_mem_2 = new int_logical_scalar_shift_buffer<Torus>(
stream, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks, true);
overflow_sub_mem =
new int_overflowing_sub_memory<Torus>(stream, params, num_blocks, true);
comparison_buffer = new int_comparison_buffer<Torus>(
stream, COMPARISON_TYPE::NE, params, num_blocks, false, true);
init_lookup_tables(stream, num_blocks);
init_temporary_buffers(stream, num_blocks);
sub_stream_1 = new cuda_stream_t(stream->gpu_index);
sub_stream_2 = new cuda_stream_t(stream->gpu_index);
sub_stream_3 = new cuda_stream_t(stream->gpu_index);
sub_stream_4 = new cuda_stream_t(stream->gpu_index);
}
void release(cuda_stream_t *stream) {
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
// release and delete other operation memory objects
shift_mem_1->release(stream);
shift_mem_2->release(stream);
overflow_sub_mem->release(stream);
comparison_buffer->release(stream);
delete shift_mem_1;
delete shift_mem_2;
delete overflow_sub_mem;
delete comparison_buffer;
// release and delete lookup tables
// masking_luts_1 and masking_luts_2
for (int i = 0; i < params.message_modulus - 1; i++) {
masking_luts_1[i]->release(stream);
masking_luts_2[i]->release(stream);
delete masking_luts_1[i];
delete masking_luts_2[i];
}
delete[] masking_luts_1;
delete[] masking_luts_2;
// message_extract_lut_1 and message_extract_lut_2
message_extract_lut_1->release(stream);
message_extract_lut_2->release(stream);
delete message_extract_lut_1;
delete message_extract_lut_2;
// zero_out_if_overflow_did_not_happen
zero_out_if_overflow_did_not_happen[0]->release(stream);
zero_out_if_overflow_did_not_happen[1]->release(stream);
delete zero_out_if_overflow_did_not_happen[0];
delete zero_out_if_overflow_did_not_happen[1];
delete[] zero_out_if_overflow_did_not_happen;
// zero_out_if_overflow_happened
zero_out_if_overflow_happened[0]->release(stream);
zero_out_if_overflow_happened[1]->release(stream);
delete zero_out_if_overflow_happened[0];
delete zero_out_if_overflow_happened[1];
delete[] zero_out_if_overflow_happened;
// merge_overflow_flags_luts
for (int i = 0; i < num_bits_in_message; i++) {
merge_overflow_flags_luts[i]->release(stream);
delete merge_overflow_flags_luts[i];
}
delete[] merge_overflow_flags_luts;
// release sub streams
sub_stream_1->release();
sub_stream_2->release();
sub_stream_3->release();
sub_stream_4->release();
// drop temporary buffers
cuda_drop_async(remainder1, stream);
cuda_drop_async(remainder2, stream);
cuda_drop_async(numerator_block_stack, stream);
cuda_drop_async(numerator_block_1, stream);
cuda_drop_async(tmp_radix, stream);
cuda_drop_async(interesting_remainder1, stream);
cuda_drop_async(interesting_remainder2, stream);
cuda_drop_async(interesting_divisor, stream);
cuda_drop_async(divisor_ms_blocks, stream);
cuda_drop_async(new_remainder, stream);
cuda_drop_async(subtraction_overflowed, stream);
cuda_drop_async(did_not_overflow, stream);
cuda_drop_async(overflow_sum, stream);
cuda_drop_async(overflow_sum_radix, stream);
cuda_drop_async(tmp_1, stream);
cuda_drop_async(at_least_one_upper_block_is_non_zero, stream);
cuda_drop_async(cleaned_merged_interesting_remainder, stream);
}
};
template <typename Torus> struct int_bitop_buffer {
int_radix_params params;

View File

@@ -5,7 +5,7 @@
#include <cstdint>
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
@@ -111,6 +111,28 @@ get_buffer_size_partial_sm_programmable_bootstrap(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_tbc(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_programmable_bootstrap_tbc(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // tbc
}
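As a hedged worked example (numbers not from the diff): plugging Torus = uint64_t and polynomial_size = 2048 into the three helpers above gives
// full_sm_tbc    = 2048*8 + 2048*8 + (2048/2)*16 = 49152 B (48 KiB)
// partial_sm_tbc = (2048/2)*16                   = 16384 B (16 KiB)
// minimum_sm_tbc = (2048/2)*16                   = 16384 B (16 KiB)
// so a device with at least 64 KiB of shared memory per block can run the
// TBC kernel in FULLSM mode, one between 32 KiB and 64 KiB in PARTIALSM,
// and one below 32 KiB falls back to NOSM.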
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
@@ -125,6 +147,11 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
@@ -213,6 +240,54 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
polynomial_size / 2 * sizeof(double2),
stream);
} break;
#if CUDA_ARCH >= 900
case PBS_VARIANT::TBC: {
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
Torus>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If over that we don't have more
// partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
// not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2),
stream);
} break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
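A stand-alone sketch of the same three-way selection, with illustrative names (not part of the diff), may make the branches above easier to follow:

#include <cstdint>
enum class SmTier { NOSM, PARTIALSM, FULLSM };
// Mirrors the branches above: the tier is chosen from the available shared
// memory, keeping minimum_sm_tbc reserved in every mode.
inline SmTier select_tbc_tier(uint64_t max_shared_memory, uint64_t full_sm,
                              uint64_t partial_sm, uint64_t minimum_sm_tbc) {
  if (max_shared_memory < partial_sm + minimum_sm_tbc)
    return SmTier::NOSM;      // whole accumulator spills to device memory
  if (max_shared_memory < full_sm + minimum_sm_tbc)
    return SmTier::PARTIALSM; // only the fft accumulator stays in shared memory
  return SmTier::FULLSM;      // both accumulators fit in shared memory
}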
@@ -281,6 +356,25 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
#if (CUDA_ARCH >= 900)
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#endif
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_cg(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
@@ -295,6 +389,13 @@ void scratch_cuda_programmable_bootstrap(
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t max_shared_memory);
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,

View File

@@ -51,6 +51,37 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
int8_t **pbs_buffer);
}
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
#endif
template <typename Torus, typename STorus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
@@ -113,12 +144,25 @@ template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
double2 *keybundle_fft;
Torus *global_accumulator;
@@ -133,18 +177,20 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
// default
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<
Torus>(polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
Torus>(polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
polynomial_size);
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
Torus>(polynomial_size);
// cg
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
@@ -162,6 +208,19 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
auto num_blocks_acc_cg =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
#if CUDA_ARCH >= 900
uint64_t full_sm_tbc_accumulate =
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_sm_tbc_accumulate =
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
auto num_blocks_acc_tbc = num_blocks_acc_cg;
#endif
if (allocate_gpu_memory) {
// Keybundle
if (max_shared_memory < full_sm_keybundle)
@@ -169,7 +228,16 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
num_blocks_keybundle * full_sm_keybundle, stream);
switch (pbs_variant) {
case DEFAULT:
case PBS_VARIANT::CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
break;
case PBS_VARIANT::DEFAULT:
// Accumulator step one
if (max_shared_memory < partial_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
@@ -183,15 +251,27 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
num_blocks_acc_step_two * full_sm_accumulate_step_two, stream);
break;
case CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream);
#if CUDA_ARCH >= 900
case TBC:
// There is a minimum amount of memory we need to run the TBC PBS, which
// is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
// because otherwise the previous check would have redirected
// computation to some other variant. If over that we don't have more
// partial_sm_tbc_accumulate bytes, TBC PBS will run on NOSM. If we have
// partial_sm_tbc_accumulate but not full_sm_tbc_accumulate bytes, it
// will run on PARTIALSM. Otherwise, FULLSM.
//
// NOSM mode actually requires minimum_sm_tbc shared memory bytes.
// Accumulator TBC
if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
num_blocks_acc_tbc * full_sm_tbc_accumulate, stream);
else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream);
break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
@@ -200,7 +280,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
stream);
global_accumulator = (Torus *)cuda_malloc_async(
num_blocks_acc_step_two * polynomial_size * sizeof(Torus), stream);
num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
stream);
@@ -222,6 +302,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
if (d_mem_acc_cg)
cuda_drop_async(d_mem_acc_cg, stream);
break;
#if CUDA_ARCH >= 900
case TBC:
if (d_mem_acc_tbc)
cuda_drop_async(d_mem_acc_tbc, stream);
break;
#endif
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
@@ -232,10 +318,9 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
}
};
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t ct_count);
#endif
template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t max_shared_memory);
#endif // CUDA_MULTI_BIT_H

View File

@@ -71,6 +71,18 @@ bool cuda_check_support_cooperative_groups() {
return cooperative_groups_supported > 0;
}
/// Returns
/// false if Thread Block Cluster is not supported.
/// true otherwise
bool cuda_check_support_thread_block_clusters() {
// To-do: Is this really the best way to check support?
int tbc_supported = 0;
check_cuda_error(
cudaDeviceGetAttribute(&tbc_supported, cudaDevAttrClusterLaunch, 0));
return tbc_supported > 0;
}
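A hedged usage sketch (the caller below is hypothetical; the real dispatch also weighs polynomial size and shared memory via has_support_to_cuda_programmable_bootstrap_tbc):

// Hypothetical variant selection built on the runtime checks above.
PBS_VARIANT pick_pbs_variant() {
  if (cuda_check_support_thread_block_clusters())
    return PBS_VARIANT::TBC; // thread block clusters: Hopper (sm_90) and later
  if (cuda_check_support_cooperative_groups())
    return PBS_VARIANT::CG;
  return PBS_VARIANT::DEFAULT;
}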
/// Copy memory to the GPU asynchronously
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {

View File

@@ -0,0 +1,83 @@
#include "integer/div_rem.cuh"
void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_integer_div_rem_kb<uint64_t>(
stream, (int_div_rem_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
}
void cuda_integer_div_rem_radix_ciphertext_kb_64(
cuda_stream_t *stream, void *quotient, void *remainder, void *numerator,
void *divisor, int8_t *mem_ptr, void *bsk, void *ksk, uint32_t num_blocks) {
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
switch (mem->params.polynomial_size) {
case 512:
host_integer_div_rem_kb<uint64_t, Degree<512>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 1024:
host_integer_div_rem_kb<uint64_t, Degree<1024>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 2048:
host_integer_div_rem_kb<uint64_t, Degree<2048>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 4096:
host_integer_div_rem_kb<uint64_t, Degree<4096>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 8192:
host_integer_div_rem_kb<uint64_t, Degree<8192>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
case 16384:
host_integer_div_rem_kb<uint64_t, Degree<16384>>(
stream, static_cast<uint64_t *>(quotient),
static_cast<uint64_t *>(remainder), static_cast<uint64_t *>(numerator),
static_cast<uint64_t *>(divisor), bsk, static_cast<uint64_t *>(ksk),
mem, num_blocks);
break;
default:
PANIC("Cuda error (integer div_rem): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
}
}
void cleanup_cuda_integer_div_rem(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}
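For orientation, a hedged sketch of the scratch -> run -> cleanup lifecycle these three entry points follow; every parameter below is a placeholder supplied by the caller:

// Hypothetical call sequence for an encrypted division (uint64_t blocks).
int8_t *mem = nullptr;
scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    stream, &mem, glwe_dimension, polynomial_size, big_lwe_dimension,
    small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
    grouping_factor, num_blocks, message_modulus, carry_modulus, MULTI_BIT,
    /*allocate_gpu_memory=*/true);
cuda_integer_div_rem_radix_ciphertext_kb_64(stream, quotient, remainder,
                                            numerator, divisor, mem, bsk, ksk,
                                            num_blocks);
cleanup_cuda_integer_div_rem(stream, &mem);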

View File

@@ -0,0 +1,587 @@
#ifndef TFHE_RS_DIV_REM_CUH
#define TFHE_RS_DIV_REM_CUH
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/negation.cuh"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>
int ceil_div(int a, int b) { return (a + b - 1) / b; }
// struct makes it easier to use a list of ciphertexts and move data between
// them. The struct does not allocate or drop any memory; it only keeps track
// of the number of ciphertexts in the list.
template <typename Torus> struct lwe_ciphertext_list {
Torus *data;
size_t max_blocks;
size_t len;
int_radix_params params;
size_t big_lwe_size;
size_t radix_size;
size_t big_lwe_size_bytes;
size_t radix_size_bytes;
size_t big_lwe_dimension;
lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks)
: data(src), params(params), max_blocks(max_blocks) {
big_lwe_size = params.big_lwe_dimension + 1;
big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
radix_size = max_blocks * big_lwe_size;
radix_size_bytes = radix_size * sizeof(Torus);
big_lwe_dimension = params.big_lwe_dimension;
len = max_blocks;
}
// copies ciphertexts from a Torus*, from `start_block` up to and including
// `finish_block`; does not change the value of `len`
void copy_from(Torus *src, size_t start_block, size_t finish_block,
cuda_stream_t *stream) {
size_t tmp_len = finish_block - start_block + 1;
cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
tmp_len * big_lwe_size_bytes, stream);
}
// copies ciphertexts from an lwe_ciphertext_list, from `start_block` up to
// and including `finish_block`; does not change the value of `len`
void copy_from(const lwe_ciphertext_list &src, size_t start_block,
size_t finish_block, cuda_stream_t *stream) {
copy_from(src.data, start_block, finish_block, stream);
}
// copies ciphertexts from a Torus*, from `start_block` up to and including
// `finish_block`, updating the value of `len`
void clone_from(Torus *src, size_t start_block, size_t finish_block,
cuda_stream_t *stream) {
len = finish_block - start_block + 1;
cuda_memcpy_async_gpu_to_gpu(data, &src[start_block * big_lwe_size],
len * big_lwe_size_bytes, stream);
}
// copies ciphertexts from an lwe_ciphertext_list, from `start_block` up to
// and including `finish_block`, updating the value of `len`
void clone_from(const lwe_ciphertext_list &src, size_t start_block,
size_t finish_block, cuda_stream_t *stream) {
clone_from(src.data, start_block, finish_block, stream);
}
// assign zero to blocks from `start_block` up to and including `finish_block`
void assign_zero(size_t start_block, size_t finish_block,
cuda_stream_t *stream) {
auto size = finish_block - start_block + 1;
cuda_memset_async(&data[start_block * big_lwe_size], 0,
size * big_lwe_size_bytes, stream);
}
// return pointer to last block
Torus *last_block() { return &data[(len - 1) * big_lwe_size]; }
// return pointer to first_block
Torus *first_block() { return data; }
// return block with `index`
Torus *get_block(size_t index) {
assert(index < len);
return &data[index * big_lwe_size];
}
bool is_empty() { return len == 0; }
// does not drop the actual memory in `data`, only reduces the value of `len` by one
void pop() {
if (len > 0)
len--;
else
assert(len > 0);
}
// insert ciphertext at index `ind`
void insert(size_t ind, Torus *ciphertext_block, cuda_stream_t *stream) {
assert(ind <= len);
assert(len < max_blocks);
size_t insert_offset = ind * big_lwe_size;
for (size_t i = len; i > ind; i--) {
Torus *src = &data[(i - 1) * big_lwe_size];
Torus *dst = &data[i * big_lwe_size];
cuda_memcpy_async_gpu_to_gpu(dst, src, big_lwe_size_bytes, stream);
}
cuda_memcpy_async_gpu_to_gpu(&data[insert_offset], ciphertext_block,
big_lwe_size_bytes, stream);
len++;
}
// push ciphertext at the end of `data`
void push(Torus *ciphertext_block, cuda_stream_t *stream) {
assert(len < max_blocks);
size_t offset = len * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(&data[offset], ciphertext_block,
big_lwe_size_bytes, stream);
len++;
}
// duplicate ciphertext into `number_of_blocks` ciphertexts
void fill_with_same_ciphertext(Torus *ciphertext, size_t number_of_blocks,
cuda_stream_t *stream) {
assert(number_of_blocks <= max_blocks);
for (size_t i = 0; i < number_of_blocks; i++) {
Torus *dest = &data[i * big_lwe_size];
cuda_memcpy_async_gpu_to_gpu(dest, ciphertext, big_lwe_size_bytes,
stream);
}
len = number_of_blocks;
}
// used for debugging; prints the body of each ciphertext.
void print_blocks_body(const char *name) {
for (int i = 0; i < len; i++) {
print_debug(name, &data[i * big_lwe_size + big_lwe_dimension], 1);
}
}
};
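A brief, hedged usage sketch of this wrapper; d_buffer, some_block, params and stream are placeholders:

// Hypothetical: view an existing device buffer as a list of 4 radix blocks.
lwe_ciphertext_list<uint64_t> list(d_buffer, params, /*max_blocks=*/4);
list.assign_zero(0, 3, stream);     // zero all four blocks
list.pop();                         // len: 4 -> 3, memory untouched
list.push(some_block, stream);      // len: 3 -> 4, copies one block in
uint64_t *last = list.last_block(); // pointer arithmetic only, no copy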
template <typename Torus>
__host__ void scratch_cuda_integer_div_rem_kb(
cuda_stream_t *stream, int_div_rem_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_div_rem_memory<Torus>(stream, params, num_blocks,
allocate_gpu_memory);
}
template <typename Torus, class params>
__host__ void host_integer_div_rem_kb(cuda_stream_t *stream, Torus *quotient,
Torus *remainder, Torus *numerator,
Torus *divisor, void *bsk, uint64_t *ksk,
int_div_rem_memory<uint64_t> *mem_ptr,
uint32_t num_blocks) {
auto radix_params = mem_ptr->params;
auto big_lwe_dimension = radix_params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
uint32_t message_modulus = radix_params.message_modulus;
uint32_t carry_modulus = radix_params.carry_modulus;
uint32_t num_bits_in_message = 31 - __builtin_clz(message_modulus);
uint32_t total_bits = num_bits_in_message * num_blocks;
// put temporary buffers in lwe_ciphertext_list for easy use
lwe_ciphertext_list<Torus> remainder1(mem_ptr->remainder1, radix_params,
num_blocks);
lwe_ciphertext_list<Torus> remainder2(mem_ptr->remainder2, radix_params,
num_blocks);
lwe_ciphertext_list<Torus> numerator_block_stack(
mem_ptr->numerator_block_stack, radix_params, num_blocks);
lwe_ciphertext_list<Torus> numerator_block_1(mem_ptr->numerator_block_1,
radix_params, 1);
lwe_ciphertext_list<Torus> tmp_radix(mem_ptr->tmp_radix, radix_params,
num_blocks + 1);
lwe_ciphertext_list<Torus> interesting_remainder1(
mem_ptr->interesting_remainder1, radix_params, num_blocks + 1);
lwe_ciphertext_list<Torus> interesting_remainder2(
mem_ptr->interesting_remainder2, radix_params, num_blocks);
lwe_ciphertext_list<Torus> interesting_divisor(mem_ptr->interesting_divisor,
radix_params, num_blocks);
lwe_ciphertext_list<Torus> divisor_ms_blocks(mem_ptr->divisor_ms_blocks,
radix_params, num_blocks);
lwe_ciphertext_list<Torus> new_remainder(mem_ptr->new_remainder, radix_params,
num_blocks);
lwe_ciphertext_list<Torus> subtraction_overflowed(
mem_ptr->subtraction_overflowed, radix_params, 1);
lwe_ciphertext_list<Torus> did_not_overflow(mem_ptr->did_not_overflow,
radix_params, 1);
lwe_ciphertext_list<Torus> overflow_sum(mem_ptr->overflow_sum, radix_params,
1);
lwe_ciphertext_list<Torus> overflow_sum_radix(mem_ptr->overflow_sum_radix,
radix_params, num_blocks);
lwe_ciphertext_list<Torus> tmp_1(mem_ptr->tmp_1, radix_params, num_blocks);
lwe_ciphertext_list<Torus> at_least_one_upper_block_is_non_zero(
mem_ptr->at_least_one_upper_block_is_non_zero, radix_params, 1);
lwe_ciphertext_list<Torus> cleaned_merged_interesting_remainder(
mem_ptr->cleaned_merged_interesting_remainder, radix_params, num_blocks);
numerator_block_stack.clone_from(numerator, 0, num_blocks - 1, stream);
remainder1.assign_zero(0, num_blocks - 1, stream);
remainder2.assign_zero(0, num_blocks - 1, stream);
cuda_memset_async(quotient, 0, big_lwe_size_bytes * num_blocks, stream);
for (int i = total_bits - 1; i >= 0; i--) {
uint32_t block_of_bit = i / num_bits_in_message;
uint32_t pos_in_block = i % num_bits_in_message;
uint32_t msb_bit_set = total_bits - 1 - i;
uint32_t last_non_trivial_block = msb_bit_set / num_bits_in_message;
// Index of the first block of the remainder that is fully trivial 0,
// with all blocks after it also trivial zeros.
// This number is in the range 1..=num_blocks - 1
uint32_t first_trivial_block = last_non_trivial_block + 1;
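// Illustrative values (not from the diff): with 2-bit messages and
// num_blocks = 4 (total_bits = 8), iteration i = 5 gives block_of_bit = 2,
// pos_in_block = 1, msb_bit_set = 2, last_non_trivial_block = 1 and
// first_trivial_block = 2.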
interesting_remainder1.clone_from(remainder1, 0, last_non_trivial_block,
stream);
interesting_remainder2.clone_from(remainder2, 0, last_non_trivial_block,
stream);
interesting_divisor.clone_from(divisor, 0, last_non_trivial_block, stream);
divisor_ms_blocks.clone_from(divisor,
(msb_bit_set + 1) / num_bits_in_message,
num_blocks - 1, stream);
// We split the divisor at a block position, when in reality the split
// should be at a bit position, meaning that (depending on msb_bit_set) the
// two parts may share some bits they should not. So we do one PBS on the
// last block of interesting_divisor and one on the first block of
// divisor_ms_blocks to trim out the bits that should not be there
auto trim_last_interesting_divisor_bits = [&](cuda_stream_t *stream) {
if ((msb_bit_set + 1) % num_bits_in_message == 0) {
return;
}
// The last block of the interesting part of the remainder
// can contain bits which we should not account for
// we have to zero them out.
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
// e.g. with 2 bits in the message:
// if pos_in_block is 0, we want to keep only the first bit
//   (right shift the mask by 1);
// if pos_in_block is 1, we want to keep both bits
//   (right shift the mask by 0)
uint32_t shift_amount = num_bits_in_message - (pos_in_block + 1);
// Create mask of 1s on the message part, 0s in the carries
uint32_t full_message_mask = message_modulus - 1;
// Shift the mask so that we will only keep bits we should
uint32_t shifted_mask = full_message_mask >> shift_amount;
integer_radix_apply_univariate_lookup_table_kb(
stream, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsk, ksk, 1,
mem_ptr->masking_luts_1[shifted_mask]);
}; // trim_last_interesting_divisor_bits
auto trim_first_divisor_ms_bits = [&](cuda_stream_t *stream) {
if (divisor_ms_blocks.is_empty() ||
((msb_bit_set + 1) % num_bits_in_message) == 0) {
return;
}
// Where the msb is set in the block
uint32_t pos_in_block = msb_bit_set % num_bits_in_message;
// e.g. with 2 bits in the message:
// if pos_in_block is 0, we want to discard the first bit
//   (left shift the mask by 1);
// if pos_in_block is 1, we want to discard both bits
//   (left shift the mask by 2),
// i.e. shift_amount = pos_in_block + 1
uint32_t shift_amount = pos_in_block + 1;
uint32_t full_message_mask = message_modulus - 1;
uint32_t shifted_mask = full_message_mask << shift_amount;
// Keep the mask within the range of message bits, so that
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb(
stream, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsk, ksk, 1,
mem_ptr->masking_luts_2[shifted_mask]);
}; // trim_first_divisor_ms_bits
// This does
// R := R << 1; R(0) := N(i)
//
// We could do that by left shifting R by one, then unchecked_add-ing the
// correct numerator bit.
//
// However, to keep the remainder clean (noise wise), what we do instead is
// put the block from which we need to extract the bit as the LSB of the
// remainder, so that left shifting will pull in the bit we need.
auto left_shift_interesting_remainder1 = [&](cuda_stream_t *stream3) {
numerator_block_1.clone_from(numerator_block_stack,
numerator_block_stack.len - 1,
numerator_block_stack.len - 1, stream3);
numerator_block_stack.pop();
interesting_remainder1.insert(0, numerator_block_1.first_block(),
stream3);
host_integer_radix_logical_scalar_shift_kb_inplace(
stream3, interesting_remainder1.data, 1, mem_ptr->shift_mem_1, bsk,
ksk, interesting_remainder1.len);
tmp_radix.clone_from(interesting_remainder1, 0,
interesting_remainder1.len - 1, stream3);
radix_blocks_rotate_left<<<interesting_remainder1.len, 256, 0,
stream3->stream>>>(
interesting_remainder1.data, tmp_radix.data, 1,
interesting_remainder1.len, big_lwe_size);
numerator_block_1.clone_from(interesting_remainder1,
interesting_remainder1.len - 1,
interesting_remainder1.len - 1, stream3);
interesting_remainder1.pop();
if (pos_in_block != 0) {
// We have not yet extracted all the bits from this numerator
// so, we put it back on the front so that it gets taken next iteration
numerator_block_stack.push(numerator_block_1.first_block(), stream3);
}
}; // left_shift_interesting_remainder1
auto left_shift_interesting_remainder2 = [&](cuda_stream_t *stream4) {
host_integer_radix_logical_scalar_shift_kb_inplace(
stream4, interesting_remainder2.data, 1, mem_ptr->shift_mem_2, bsk,
ksk, interesting_remainder2.len);
}; // left_shift_interesting_remainder2
stream->synchronize();
#pragma omp parallel sections
{
#pragma omp section
{
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_stream_1);
}
#pragma omp section
{
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_stream_2);
}
#pragma omp section
{
// interesting_remainder1
// numerator_block_stack
left_shift_interesting_remainder1(mem_ptr->sub_stream_3);
}
#pragma omp section
{
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_stream_4);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1);
cuda_synchronize_stream(mem_ptr->sub_stream_2);
cuda_synchronize_stream(mem_ptr->sub_stream_3);
cuda_synchronize_stream(mem_ptr->sub_stream_4);
// if interesting_remainder1 != 0 -> interesting_remainder2 == 0
// if interesting_remainder1 == 0 -> interesting_remainder2 != 0
// In practice interesting_remainder1 contains the numerator bit,
// but in that position, interesting_remainder2 always has a 0
auto &merged_interesting_remainder = interesting_remainder1;
host_addition(stream, merged_interesting_remainder.data,
merged_interesting_remainder.data,
interesting_remainder2.data, radix_params.big_lwe_dimension,
merged_interesting_remainder.len);
// after create_clean_version_of_merged_remainder
// `merged_interesting_remainder` will be reused as
// `cleaned_merged_interesting_remainder`
cleaned_merged_interesting_remainder.clone_from(
merged_interesting_remainder, 0, merged_interesting_remainder.len - 1,
stream);
assert(merged_interesting_remainder.len == interesting_divisor.len);
// `new_remainder` is not initialized yet, so we need to set its length
new_remainder.len = merged_interesting_remainder.len;
// fills:
// `new_remainder` - radix ciphertext
// `subtraction_overflowed` - single ciphertext
auto do_overflowing_sub = [&](cuda_stream_t *stream) {
host_integer_overflowing_sub_kb<Torus, params>(
stream, new_remainder.data, subtraction_overflowed.data,
merged_interesting_remainder.data, interesting_divisor.data, bsk, ksk,
mem_ptr->overflow_sub_mem, merged_interesting_remainder.len);
};
// fills:
// `at_least_one_upper_block_is_non_zero` - single ciphertext
auto check_divisor_upper_blocks = [&](cuda_stream_t *stream) {
auto &trivial_blocks = divisor_ms_blocks;
if (trivial_blocks.is_empty()) {
cuda_memset_async(at_least_one_upper_block_is_non_zero.first_block(), 0,
big_lwe_size_bytes, stream);
} else {
// We could call unchecked_scalar_ne
// But we are in the special case where scalar == 0
// So we can skip some stuff
host_compare_with_zero_equality(
stream, tmp_1.data, trivial_blocks.data, mem_ptr->comparison_buffer,
bsk, ksk, trivial_blocks.len,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
tmp_1.len =
ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
is_at_least_one_comparisons_block_true(
stream, at_least_one_upper_block_is_non_zero.data, tmp_1.data,
mem_ptr->comparison_buffer, bsk, ksk, tmp_1.len);
}
};
// Creates a cleaned version (noise wise) of the merged remainder
// so that it can be safely used in bivariate PBSes
// fills:
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder = [&](cuda_stream_t *stream) {
integer_radix_apply_univariate_lookup_table_kb(
stream, cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data, bsk, ksk,
cleaned_merged_interesting_remainder.len,
mem_ptr->message_extract_lut_1);
};
// phase 2
stream->synchronize();
#pragma omp parallel sections
{
#pragma omp section
{
// new_remainder
// subtraction_overflowed
do_overflowing_sub(mem_ptr->sub_stream_1);
}
#pragma omp section
{
// at_least_one_upper_block_is_non_zero
check_divisor_upper_blocks(mem_ptr->sub_stream_2);
}
#pragma omp section
{
// cleaned_merged_interesting_remainder
create_clean_version_of_merged_remainder(mem_ptr->sub_stream_3);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1);
cuda_synchronize_stream(mem_ptr->sub_stream_2);
cuda_synchronize_stream(mem_ptr->sub_stream_3);
host_addition(stream, overflow_sum.data, subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data,
radix_params.big_lwe_dimension, 1);
int factor = (i) ? 3 : 2;
int factor_lut_id = factor - 2;
overflow_sum_radix.fill_with_same_ciphertext(
overflow_sum.first_block(), cleaned_merged_interesting_remainder.len,
stream);
auto conditionally_zero_out_merged_interesting_remainder =
[&](cuda_stream_t *stream) {
integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
stream, cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data,
overflow_sum_radix.data, bsk, ksk,
cleaned_merged_interesting_remainder.len,
mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
factor);
};
auto conditionally_zero_out_merged_new_remainder =
[&](cuda_stream_t *stream) {
integer_radix_apply_bivariate_lookup_table_kb_factor<Torus>(
stream, new_remainder.data, new_remainder.data,
overflow_sum_radix.data, bsk, ksk, new_remainder.len,
mem_ptr->zero_out_if_overflow_happened[factor_lut_id], factor);
};
auto set_quotient_bit = [&](cuda_stream_t *stream) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, did_not_overflow.data, subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data, bsk, ksk, 1,
mem_ptr->merge_overflow_flags_luts[pos_in_block]);
host_addition(stream, &quotient[block_of_bit * big_lwe_size],
&quotient[block_of_bit * big_lwe_size],
did_not_overflow.data, radix_params.big_lwe_dimension, 1);
};
stream->synchronize();
#pragma omp parallel sections
{
#pragma omp section
{
// cleaned_merged_interesting_remainder
conditionally_zero_out_merged_interesting_remainder(
mem_ptr->sub_stream_1);
}
#pragma omp section
{
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_stream_2);
}
#pragma omp section
{
// quotient
set_quotient_bit(mem_ptr->sub_stream_3);
}
}
cuda_synchronize_stream(mem_ptr->sub_stream_1);
cuda_synchronize_stream(mem_ptr->sub_stream_2);
cuda_synchronize_stream(mem_ptr->sub_stream_3);
assert(first_trivial_block - 1 == cleaned_merged_interesting_remainder.len);
assert(first_trivial_block - 1 == new_remainder.len);
remainder1.copy_from(cleaned_merged_interesting_remainder, 0,
first_trivial_block - 1, stream);
remainder2.copy_from(new_remainder, 0, first_trivial_block - 1, stream);
}
assert(remainder1.len == remainder2.len);
// Clean the quotient and remainder
// as even though they have no carries, they are not at nominal noise level
host_addition(stream, remainder, remainder1.data, remainder2.data,
radix_params.big_lwe_dimension, remainder1.len);
stream->synchronize();
#pragma omp parallel sections
{
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_stream_1, remainder, remainder, bsk, ksk, num_blocks,
mem_ptr->message_extract_lut_1);
}
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
mem_ptr->sub_stream_2, quotient, quotient, bsk, ksk, num_blocks,
mem_ptr->message_extract_lut_2);
}
}
mem_ptr->sub_stream_1->synchronize();
mem_ptr->sub_stream_2->synchronize();
}
#endif // TFHE_RS_DIV_REM_CUH
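For reference, a hedged plaintext analogue of the restoring (shift-and-subtract) division the encrypted loop above implements, one bit per iteration; the encrypted version replaces the direct comparison with overflowing_sub and the overflow-flag LUTs:

#include <cstdint>
// Plaintext model of the loop above (illustrative only; assumes divisor != 0).
void plain_div_rem(uint64_t numerator, uint64_t divisor, uint32_t total_bits,
                   uint64_t &quotient, uint64_t &remainder) {
  quotient = 0;
  remainder = 0;
  for (int i = total_bits - 1; i >= 0; i--) {
    // R := R << 1; R(0) := N(i)
    remainder = (remainder << 1) | ((numerator >> i) & 1);
    if (remainder >= divisor) {     // encrypted: overflowing_sub + zero-out LUTs
      remainder -= divisor;         // keep new_remainder
      quotient |= uint64_t(1) << i; // set_quotient_bit
    }
  }
}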

View File

@@ -141,3 +141,41 @@ void cleanup_cuda_propagate_single_carry(cuda_stream_t *stream,
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}
void scratch_cuda_apply_univariate_lut_kb_64(
cuda_stream_t *stream, int8_t **mem_ptr, void *input_lut,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_apply_univariate_lut_kb<uint64_t>(
stream, (int_radix_lut<uint64_t> **)mem_ptr,
static_cast<uint64_t *>(input_lut), num_radix_blocks, params,
allocate_gpu_memory);
}
void cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
void *output_radix_lwe,
void *input_radix_lwe, int8_t *mem_ptr,
void *ksk, void *bsk,
uint32_t num_blocks) {
host_apply_univariate_lut_kb<uint64_t>(
stream, static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk), bsk,
num_blocks);
}
void cleanup_cuda_apply_univariate_lut_kb_64(cuda_stream_t *stream,
int8_t **mem_ptr_void) {
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release(stream);
}

View File

@@ -172,6 +172,47 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}
template <typename Torus>
__host__ void integer_radix_apply_bivariate_lookup_table_kb_factor(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
int_radix_lut<Torus> *lut, uint32_t shift) {
cudaSetDevice(stream->gpu_index);
// apply_lookup_table_bivariate
auto params = lut->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
auto message_modulus = params.message_modulus;
// Left message is shifted
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
pack_bivariate_blocks(stream, lwe_array_pbs_in, lut->lwe_trivial_indexes,
lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
big_lwe_dimension, shift, num_radix_blocks);
check_cuda_error(cudaGetLastError());
// Apply LUT
cuda_keyswitch_lwe_ciphertext_vector(
stream, lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, lwe_array_pbs_in,
lut->lwe_trivial_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes_out, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks,
lut->lwe_trivial_indexes, bsk, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}
// Rotates the slice in-place such that the first mid elements of the slice move
// to the end while the last array_length elements move to the front. After
// calling rotate_left, the element previously at index mid will become the
// first element in the slice.
@@ -235,6 +276,24 @@ void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
message_modulus, carry_modulus, wrapped_f);
}
template <typename Torus>
void generate_lookup_table_bivariate_with_factor(
Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, int factor) {
Torus factor_u64 = factor;
auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
Torus lhs = (input / factor_u64) % message_modulus;
Torus rhs = (input % factor_u64) % message_modulus;
return f(lhs, rhs);
};
generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, wrapped_f);
}
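The wrapped function treats its single input as a packed pair: the left operand is pre-scaled by factor before the two ciphertexts are summed, so input = lhs * factor + rhs, and the lambda recovers each half with one division and one modulo. A host-side sketch of the encoding round-trip, with hypothetical 2-bit parameters:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t message_modulus = 4; // illustrative 2-bit messages
  const uint64_t factor = 4;          // scaling applied to the left operand
  uint64_t lhs = 3, rhs = 2;
  // Packing (done homomorphically by pack_bivariate_blocks)
  uint64_t input = lhs * factor + rhs; // 14
  // Decoding (done inside wrapped_f)
  uint64_t lhs_dec = (input / factor) % message_modulus; // 3
  uint64_t rhs_dec = (input % factor) % message_modulus; // 2
  printf("%llu %llu\n", (unsigned long long)lhs_dec,
         (unsigned long long)rhs_dec);
}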
/*
* generate bivariate accumulator for device pointer
* v_stream - cuda stream
@@ -266,7 +325,38 @@ void generate_device_accumulator_bivariate(
}
/*
* generate bivariate accumulator for device pointer
* generate bivariate accumulator with factor scaling for device pointer
* v_stream - cuda stream
* acc - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
*/
template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, int factor) {
// host lut
Torus *h_lut =
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
// fill bivariate accumulator
generate_lookup_table_bivariate_with_factor<Torus>(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
// copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
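The host staging buffer h_lut cannot be freed until the asynchronous copy has actually executed, so the release is enqueued on the stream behind the copy. A minimal sketch of the same idea using the plain runtime API (cudaLaunchHostFunc rather than this repo's cuda_stream_add_callback wrapper):

#include <cuda_runtime.h>
#include <cstdlib>

static void CUDART_CB free_host(void *ptr) { free(ptr); }

void upload_and_release(float *d_dst, size_t n, cudaStream_t stream) {
  float *h_buf = (float *)malloc(n * sizeof(float));
  for (size_t i = 0; i < n; i++)
    h_buf[i] = (float)i; // fill the staging buffer
  cudaMemcpyAsync(d_dst, h_buf, n * sizeof(float), cudaMemcpyHostToDevice,
                  stream);
  // Free h_buf only once the copy has executed in stream order
  cudaLaunchHostFunc(stream, free_host, h_buf);
}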
/*
* generate accumulator for device pointer
* v_stream - cuda stream
* acc - device pointer for accumulator
* ...
@@ -771,4 +861,31 @@ __host__ void reduce_signs(cuda_stream_t *stream, Torus *signs_array_out,
signs_a, bsk, ksk, 1, lut);
}
}
template <typename Torus>
void scratch_cuda_apply_univariate_lut_kb(cuda_stream_t *stream,
int_radix_lut<Torus> **mem_ptr,
Torus *input_lut,
uint32_t num_radix_blocks,
int_radix_params params,
bool allocate_gpu_memory) {
*mem_ptr = new int_radix_lut<Torus>(stream, params, 1, num_radix_blocks,
allocate_gpu_memory);
cuda_memcpy_async_to_gpu((*mem_ptr)->lut, input_lut,
(params.glwe_dimension + 1) *
params.polynomial_size * sizeof(Torus),
stream);
}
template <typename Torus>
void host_apply_univariate_lut_kb(cuda_stream_t *stream, Torus *radix_lwe_out,
Torus *radix_lwe_in,
int_radix_lut<Torus> *mem, Torus *ksk,
void *bsk, uint32_t num_blocks) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, radix_lwe_out, radix_lwe_in, bsk, ksk, num_blocks, mem);
}
#endif // TFHE_RS_INTERNAL_INTEGER_CUH

View File

@@ -79,14 +79,20 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
grouping_factor, message_modulus, carry_modulus);
switch (polynomial_size) {
case 256:
case 512:
case 1024:
case 2048:
case 4096:
case 8192:
case 16384:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
allocate_gpu_memory);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Only N = 2048 is supported")
"Supported N's are powers of two in the interval [256..16384].")
}
}
@@ -128,6 +134,30 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 512:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 1024:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 2048:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
@@ -136,9 +166,33 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 4096:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 8192:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
case 16384:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_left),
static_cast<uint64_t *>(radix_lwe_right), bsk,
static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
num_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Only N = 2048 is supported")
"Supported N's are powers of two in the interval [256..16384].")
}
}
@@ -225,8 +279,8 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
num_radix_in_vec);
break;
default:
PANIC("Cuda error (integer sum ciphertexts): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 is supported")
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
free(terms_degree);

View File

@@ -91,12 +91,15 @@ all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
}
}
template <typename Torus, sharedMemDegree SMD>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
uint32_t chunk_size, uint32_t block_size,
uint32_t num_blocks) {
extern __shared__ int8_t sharedmem[];
Torus *result = (Torus *)sharedmem;
size_t stride = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t chunk_elem_size = chunk_size * num_blocks * block_size;
@@ -106,6 +109,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
size_t block_stride = blockIdx.y * block_size;
auto dst_block = &dst_radix[block_stride];
if constexpr (SMD == NOSM)
result = dst_block;
// init shared mem with first radix of chunk
size_t tid = threadIdx.x;
for (int i = tid; i < block_size; i += stride) {
@@ -121,9 +127,9 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
}
// put result from shared mem to global mem
if constexpr (SMD == FULLSM)
for (int i = tid; i < block_size; i += stride)
dst_block[i] = result[i];
}
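tree_add_chunks is now compiled in two variants selected at launch time: FULLSM stages the accumulation in dynamic shared memory and copies it out at the end, while NOSM aliases the staging pointer to the global-memory destination and skips the copy. A self-contained sketch of this variant-selection pattern (the enum here is local to the sketch; the repo's sharedMemDegree may carry more values):

#include <cuda_runtime.h>

enum sharedMemDegree { NOSM, FULLSM };

template <typename T, sharedMemDegree SMD>
__global__ void accumulate(T *dst, const T *src, int n) {
  extern __shared__ int8_t sharedmem[];
  T *buf = (T *)sharedmem;
  if constexpr (SMD == NOSM)
    buf = dst; // no staging buffer: accumulate directly in global memory
  for (int i = threadIdx.x; i < n; i += blockDim.x)
    buf[i] = src[i] + src[n + i];
  __syncthreads();
  if constexpr (SMD == FULLSM) // copy-out only needed when staged in smem
    for (int i = threadIdx.x; i < n; i += blockDim.x)
      dst[i] = buf[i];
}

// Host side: pick the variant that fits the device's shared memory budget
void launch(float *dst, const float *src, int n, size_t max_sm,
            cudaStream_t stream) {
  size_t sm_size = n * sizeof(float);
  if (sm_size < max_sm)
    accumulate<float, FULLSM><<<1, 256, sm_size, stream>>>(dst, src, n);
  else
    accumulate<float, NOSM><<<1, 256, 0, stream>>>(dst, src, n);
}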
template <typename Torus, class params>
@@ -181,11 +187,20 @@ __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
stream, params, num_blocks_in_radix, max_num_radix_in_vec,
allocate_gpu_memory);
@@ -203,11 +218,6 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
auto old_blocks = mem_ptr->old_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
@@ -247,8 +257,16 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
if (sm_size < max_shared_memory)
tree_add_chunks<Torus, FULLSM>
<<<add_grid, 512, sm_size, stream->stream>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size,
num_blocks);
else
tree_add_chunks<Torus, NOSM><<<add_grid, 512, 0, stream->stream>>>(
new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);
check_cuda_error(cudaGetLastError());
size_t total_count = 0;
size_t message_count = 0;
@@ -260,6 +278,34 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
total_count, message_count, carry_count, sm_copy_count);
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
auto luts_message_carry =
new int_radix_lut<Torus>(stream, mem_ptr->params, 2, total_count, true);
auto message_acc = luts_message_carry->get_lut(0);
auto carry_acc = luts_message_carry->get_lut(1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(stream, message_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_message);
generate_device_accumulator<Torus>(stream, carry_acc, glwe_dimension,
polynomial_size, message_modulus,
carry_modulus, lut_f_carry);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
size_t copy_size = total_count * sizeof(Torus);
cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size, stream);
cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size, stream);
@@ -272,6 +318,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
smart_copy<<<sm_copy_count, 256, 0, stream->stream>>>(
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
big_lwe_size);
check_cuda_error(cudaGetLastError());
if (carry_count > 0)
cuda_set_value_async<Torus>(
@@ -291,6 +338,8 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor, total_count,
2, 0, max_shared_memory, mem_ptr->params.pbs_type);
luts_message_carry->release(stream);
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_blocks;
copy_size = rem_blocks * big_lwe_size * sizeof(Torus);
@@ -386,6 +435,7 @@ __host__ void host_integer_mult_radix_kb(
all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
vector_lsb_rhs, vector_msb_rhs, num_blocks);
check_cuda_error(cudaGetLastError());
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
@@ -401,6 +451,7 @@ __host__ void host_integer_mult_radix_kb(
vector_result_msb, glwe_dimension,
lsb_vector_block_count, msb_vector_block_count,
num_blocks);
check_cuda_error(cudaGetLastError());
int terms_degree[2 * num_blocks * num_blocks];
for (int i = 0; i < num_blocks * num_blocks; i++) {
@@ -427,11 +478,20 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<Torus, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
*mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
allocate_gpu_memory);

View File

@@ -35,11 +35,20 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
cudaSetDevice(stream->gpu_index);
size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
if (sm_size < cuda_get_max_shared_memory(stream->gpu_index)) {
check_cuda_error(cudaFuncSetAttribute(
tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
sm_size));
cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(
cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
check_cuda_error(cudaGetLastError());
}
*mem_ptr = new int_scalar_mul_buffer<T>(stream, params, num_radix_blocks,
allocate_gpu_memory);

View File

@@ -1 +1,44 @@
#include "programmable_bootstrap.cuh"
template <>
__device__ int get_this_block_rank(grid_group &group, bool support_dsm) {
return blockIdx.y;
}
template <>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, grid_group &group,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm) {
double2 *buffer_slice =
global_memory_buffer +
(glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
return buffer_slice;
}
#if CUDA_ARCH >= 900
template <>
__device__ int get_this_block_rank(cluster_group &cluster, bool support_dsm) {
if (support_dsm)
return cluster.block_rank();
else
return blockIdx.y;
}
template <>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, cluster_group &cluster,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm) {
double2 *buffer_slice;
if (support_dsm) {
extern __shared__ double2 smem[];
buffer_slice = cluster.map_shared_rank(
smem, glwe_id + level_id * (glwe_dimension + 1));
} else {
buffer_slice =
global_memory_buffer +
(glwe_id + level_id * (glwe_dimension + 1)) * polynomial_size / 2;
}
return buffer_slice;
}
#endif
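On sm_90, map_shared_rank lets one block of a cluster read another block's shared memory directly (distributed shared memory), which is what the TBC variant uses in place of the global-memory join buffer. A minimal stand-alone sketch, assuming a Hopper-class device and a recent CUDA toolkit:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Each block of a 2-block cluster publishes its rank in its own shared
// memory, then reads its neighbor's copy through the DSM mapping.
__global__ void __cluster_dims__(2, 1, 1) dsm_demo(int *out) {
  __shared__ int smem[1];
  cg::cluster_group cluster = cg::this_cluster();
  unsigned int rank = cluster.block_rank();
  if (threadIdx.x == 0)
    smem[0] = (int)rank;
  cluster.sync(); // every block's shared memory is now initialized
  unsigned int neighbor = (rank + 1) % cluster.num_blocks();
  int *remote = cluster.map_shared_rank(smem, neighbor);
  if (threadIdx.x == 0)
    out[rank] = remote[0];
  cluster.sync(); // keep all blocks resident until remote reads finish
}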

View File

@@ -1,8 +1,120 @@
#include "../../include/device.h"
#include "../../include/programmable_bootstrap.h"
#include "../include/device.h"
#include "programmable_bootstrap_classic.cuh"
#include "programmable_bootstrap_multibit.cuh"
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
#include "cooperative_groups.h"
using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename G>
__device__ int get_this_block_rank(G &group, bool support_dsm);
template <typename G>
__device__ double2 *
get_join_buffer_element(int level_id, int glwe_id, G &group,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm);
template <typename Torus, typename G, class params>
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
double2 *join_buffer, double2 *bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension,
int level_count, int iteration, G &group,
bool support_dsm = false) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
auto buffer_slice =
get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
polynomial_size, glwe_dimension, support_dsm);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
group.sync();
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
join_buffer, polynomial_size,
glwe_dimension, support_dsm);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
auto src_acc =
get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
polynomial_size, glwe_dimension, support_dsm);
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
join_buffer, polynomial_size,
glwe_dimension, support_dsm);
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(fft, accumulator);
__syncthreads();
}
template <typename Torus>
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
@@ -112,3 +224,5 @@ void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
"moduli are supported.")
}
}
#endif
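mul_ggsw_glwe is now templated on the group type, so the same body can synchronize either a whole cooperative grid (grid_group) or a thread block cluster (cluster_group); the per-group behavior is isolated in the get_this_block_rank / get_join_buffer_element specializations. A minimal sketch of device code written generically over a cooperative-groups handle:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Any group handle exposing sync() can drive the same device code:
// thread_block, grid_group (cooperative launch), or cluster_group (sm_90).
template <typename G>
__device__ void barrier_then_scale(float *x, int i, float s, G &group) {
  group.sync();
  x[i] *= s;
}

__global__ void demo(float *x, float s) {
  cg::thread_block block = cg::this_thread_block();
  barrier_then_scale(x, (int)threadIdx.x, s, block);
}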

View File

@@ -67,8 +67,7 @@ void scratch_cuda_programmable_bootstrap_amortized_32(
break;
default:
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
"N's are powers of two in the interval [256..16384].")
}
}

View File

@@ -7,7 +7,6 @@
#endif
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
@@ -15,104 +14,13 @@
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"
/*
* Kernel that computes the classical PBS using cooperative groups
*
@@ -222,7 +130,7 @@ __global__ void device_programmable_bootstrap_cg(
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, grid_group, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, grid);

View File

@@ -1,5 +1,5 @@
#ifndef CUDA_CG_MULTIBIT_PBS_CUH
#define CUDA_CG_MULTIBIT_PBS_CUH
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
@@ -11,6 +11,7 @@
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
@@ -106,9 +107,9 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, grid_group, params>(
accumulator, accumulator_fft, block_join_buffer, keybundle,
polynomial_size, glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
@@ -240,7 +241,9 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
}
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, input_lwe_ciphertext_count, polynomial_size,
max_shared_memory);
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
@@ -248,7 +251,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
}
template <typename Torus, class params>
__host__ void execute_cg_external_product_loop(
cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
@@ -336,7 +339,8 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
cudaSetDevice(stream->gpu_index);
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, num_samples, polynomial_size, max_shared_memory);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
@@ -349,7 +353,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
lwe_chunk_size, lwe_offset);
// Accumulate
execute_cg_external_product_loop<Torus, params>(
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,

View File

@@ -1,5 +1,8 @@
#include "programmable_bootstrap_cg_classic.cuh"
#include "programmable_bootstrap_classic.cuh"
#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_classic.cuh"
#endif
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
@@ -12,6 +15,176 @@ bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
max_shared_memory);
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
#if CUDA_ARCH >= 900
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
#else
return false;
#endif
}
#if (CUDA_ARCH >= 900)
template <typename Torus, typename STorus>
void scratch_cuda_programmable_bootstrap_tbc(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<256>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<512>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<1024>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<2048>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<4096>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<8192>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_programmable_bootstrap_tbc<Torus, STorus, AmortizedDegree<16384>>(
stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}
template <typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 512:
host_programmable_bootstrap_tbc<Torus, Degree<512>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 1024:
host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 2048:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 4096:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 8192:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
case 16384:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_luts, max_shared_memory);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}
#endif
/*
* Returns the buffer size for 64 bits executions
*/
@@ -143,9 +316,19 @@ void scratch_cuda_programmable_bootstrap_32(
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_cg<uint32_t, int32_t>(
stream, (pbs_buffer<uint32_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -168,9 +351,19 @@ void scratch_cuda_programmable_bootstrap_64(
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
max_shared_memory, allocate_gpu_memory);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_programmable_bootstrap_cg<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, CLASSICAL> **)buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -321,7 +514,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
@@ -330,9 +523,28 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
PANIC("Cuda error (classical PBS): base log should be > number of bits "
"in the ciphertext representation (32)");
pbs_buffer<uint32_t, CLASSICAL> *buffer =
(pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;
switch (buffer->pbs_variant) {
case TBC:
#if CUDA_ARCH >= 900
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
static_cast<uint32_t *>(lut_vector),
static_cast<uint32_t *>(lut_vector_indexes),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key),
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
break;
case CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
@@ -344,7 +556,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
case DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
stream, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
@@ -356,6 +569,10 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
(pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
@@ -433,7 +650,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
@@ -441,9 +658,28 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
PANIC("Cuda error (classical PBS): base log should be > number of bits "
"in the ciphertext representation (64)");
pbs_buffer<uint64_t, CLASSICAL> *buffer =
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if (CUDA_ARCH >= 900)
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key),
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
break;
case PBS_VARIANT::CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -455,7 +691,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
case PBS_VARIANT::DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -467,6 +704,10 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_luts, lwe_idx, max_shared_memory);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
/*
@@ -546,3 +787,41 @@ template void scratch_cuda_programmable_bootstrap<uint32_t, int32_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
cuda_stream_t *stream, uint32_t *lwe_array_out,
uint32_t *lwe_output_indexes, uint32_t *lut_vector,
uint32_t *lut_vector_indexes, uint32_t *lwe_array_in,
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t, int32_t>(
cuda_stream_t *stream, pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template void scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
cuda_stream_t *stream, pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#endif

View File

@@ -3,6 +3,10 @@
#include "programmable_bootstrap_multibit.cuh"
#include "programmable_bootstrap_multibit.h"
#if (CUDA_ARCH >= 900)
#include "programmable_bootstrap_tbc_multibit.cuh"
#endif
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory) {
@@ -11,6 +15,57 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
max_shared_memory);
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
#if CUDA_ARCH >= 900
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
#else
return false;
#endif
}
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
@@ -185,15 +240,34 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key),
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
#else
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
break;
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -201,11 +275,11 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -213,10 +287,13 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
template <typename Torus, typename STorus>
@@ -356,9 +433,20 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
else
#endif
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
uint64_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
@@ -378,25 +466,69 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(cuda_stream_t *stream,
x->release(stream);
}
/**
* Computes divisors of the product of num_sms (streaming multiprocessors on the
* GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
* device_multi_bit_programmable_bootstrap_keybundle) smaller than its square
* root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first
* suitable divisor. If greater, skips ahead to the (log2(max_num_pbs) - 11)-th
* such divisor, which yields a smaller chunk size.
*
* The value 13 was empirically determined based on memory requirements for
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
__host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint32_t max_shared_memory) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
int max_blocks_per_sm;
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
polynomial_size / params::opt, full_sm_keybundle);
else
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
polynomial_size / params::opt, 0);
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
int x = num_sms * max_blocks_per_sm;
int count = 0;
int divisor = 1;
int ith_divisor = 0;
#if CUDA_ARCH < 900
// We pick a smaller divisor on GPUs other than H100, so 256-bit integer
// multiplication can run
int log2_max_num_pbs = std::log2(max_num_pbs);
if (log2_max_num_pbs > 13)
ith_divisor = log2_max_num_pbs - 11;
#endif
for (int i = sqrt(x); i >= 1; i--) {
if (x % i == 0) {
if (count == ith_divisor) {
divisor = i;
break;
} else {
count++;
}
}
}
return divisor;
}
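The divisor search walks i downward from sqrt(x) and returns the ith_divisor-th divisor it meets. A host-only sketch with concrete numbers, where x stands in for num_sms * max_blocks_per_sm (the value 108 is illustrative only): the divisors of 108 at or below sqrt(108) are, in descending order, 9, 6, 4, 3, 2, 1, so ith_divisor = 0 picks 9 and ith_divisor = 2 picks 4.

#include <cmath>
#include <cstdio>

// Same selection logic as get_lwe_chunk_size, without the occupancy query
static int pick_divisor(int x, int ith_divisor) {
  int count = 0;
  for (int i = (int)std::sqrt((double)x); i >= 1; i--) {
    if (x % i == 0) {
      if (count == ith_divisor)
        return i;
      count++;
    }
  }
  return 1;
}

int main() {
  printf("%d %d\n", pick_divisor(108, 0), pick_divisor(108, 2)); // 9 4
}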
template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
@@ -437,3 +569,180 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if (CUDA_ARCH >= 900)
template <typename Torus, typename STorus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<256>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 512:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<512>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 1024:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<1024>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 2048:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<2048>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 4096:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<4096>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 8192:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<8192>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 16384:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, STorus,
AmortizedDegree<16384>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
"the ciphertext representation (64)");
switch (polynomial_size) {
case 256:
host_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t,
AmortizedDegree<256>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 512:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<512>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 1024:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<1024>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 2048:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<2048>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 4096:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<4096>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 8192:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<8192>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 16384:
host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
AmortizedDegree<16384>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
template void
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
#endif

View File

@@ -227,8 +227,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_next_polynomial(accumulator_fft,
blockIdx.x);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
@@ -469,7 +468,9 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
}
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size(input_lwe_ciphertext_count);
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, input_lwe_ciphertext_count, polynomial_size,
max_shared_memory);
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
@@ -630,7 +631,8 @@ __host__ void host_multi_bit_programmable_bootstrap(
// If a chunk size is not passed to this function, select one.
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size(num_samples);
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, num_samples, polynomial_size, max_shared_memory);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

View File

@@ -0,0 +1,424 @@
#ifndef CUDA_TBC_PBS_CUH
#define CUDA_TBC_PBS_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "types/complex/operations.cuh"
using namespace cooperative_groups;
namespace cg = cooperative_groups;
/*
 * Kernel that computes the classical PBS using thread block clusters and,
 * when supported, distributed shared memory
 *
 * - lwe_array_out: vector of output LWE ciphertexts, with length
 * (glwe_dimension * polynomial_size + 1) * num_samples
 * - lut_vector: vector of lookup tables with
 * length (glwe_dimension + 1) * polynomial_size * num_samples
 * - lut_vector_indexes: mapping between lwe_array_in and lut_vector
 * - lwe_array_in: vector of LWE inputs with length (lwe_dimension + 1) *
 * num_samples
 *
 * Each y-block computes one element of the lwe_array_out.
 */
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_tbc(
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block,
bool support_dsm) {
cluster_group cluster = this_cluster();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
if (support_dsm)
selected_memory += sizeof(Torus) * polynomial_size;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator + (ptrdiff_t)polynomial_size;
double2 *accumulator_fft =
(double2 *)accumulator_rotated +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
if constexpr (SMD == PARTIALSM) {
accumulator_fft = (double2 *)sharedmem;
if (support_dsm)
accumulator_fft += (ptrdiff_t)(polynomial_size / 2);
}
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
// Since the space in L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
// rotated array is not in use anymore by the time we perform the fft
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
for (int i = 0; i < lwe_dimension; i++) {
synchronize_threads_in_block();
// Put "a" in [0, 2N[
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[i], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
// Perform ACC * (X^ä - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, cluster_group, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
synchronize_threads_in_block();
}
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_programmable_bootstrap_tbc(
cuda_stream_t *stream, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
bool supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
polynomial_size);
if (max_shared_memory >= full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm + minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::TBC, allocate_gpu_memory);
}
/*
* Host wrapper
*/
template <typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
auto supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_dsm)
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t full_dm = full_sm;
uint64_t partial_dm = full_dm - partial_sm;
int8_t *d_mem = buffer->d_mem;
double2 *buffer_fft = buffer->global_accumulator_fft;
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
cudaLaunchConfig_t config = {0};
// The grid dimension is not affected by cluster launch, and is still
// enumerated using number of blocks. The grid dimension should be a multiple
// of cluster size.
config.gridDim = grid;
config.blockDim = thds;
cudaLaunchAttribute attribute[1];
attribute[0].id = cudaLaunchAttributeClusterDimension;
attribute[0].val.clusterDim.x = level_count; // Cluster size in X-dimension
attribute[0].val.clusterDim.y = (glwe_dimension + 1);
attribute[0].val.clusterDim.z = 1;
config.attrs = attribute;
config.numAttrs = 1;
config.stream = stream->stream;
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
config.dynamicSmemBytes = minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
supports_dsm));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
partial_dm, supports_dsm));
} else {
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
supports_dsm));
}
}
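
The cudaLaunchKernelEx setup above follows the standard CUDA 12 thread-block-cluster launch pattern. A self-contained toy version of the same mechanism (kernel and buffer names are illustrative, not part of the backend; requires compilation for sm_90):

#include "cooperative_groups.h"
#include <cstdio>
#include <cuda_runtime.h>
namespace cg = cooperative_groups;

__global__ void toy_cluster_kernel(float *out) {
  // block_rank() identifies this block within its cluster (sm_90+ only).
  out[blockIdx.x] = (float)cg::this_cluster().block_rank();
}

int main() {
  float *d_out;
  cudaMalloc(&d_out, 8 * sizeof(float));
  cudaLaunchConfig_t config = {0};
  config.gridDim = dim3(8); // must be a multiple of the cluster size
  config.blockDim = dim3(32);
  cudaLaunchAttribute attribute;
  attribute.id = cudaLaunchAttributeClusterDimension;
  attribute.val.clusterDim.x = 4; // plays the role of level_count *
  attribute.val.clusterDim.y = 1; // (glwe_dimension + 1) in the code above
  attribute.val.clusterDim.z = 1;
  config.attrs = &attribute;
  config.numAttrs = 1;
  cudaError_t err = cudaLaunchKernelEx(&config, toy_cluster_kernel, d_out);
  printf("launch: %s\n", cudaGetErrorString(err));
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}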
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
return false;
// Calculate the dimension of the kernel
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(params::degree);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
params::degree);
int thds = params::degree / params::opt;
// Get the maximum number of active blocks per streaming multiprocessors
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, NOSM>, thds,
0);
} else if (max_shared_memory < full_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
thds, partial_sm);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, FULLSM>, thds,
full_sm);
}
// Get the number of streaming multiprocessors
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
polynomial_size);
if (max_shared_memory < minimum_sm) {
// If we cannot store a single polynomial in a block's shared memory we
// cannot use TBC
return false;
} else {
return cuda_check_support_thread_block_clusters();
}
}
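
cuda_check_support_thread_block_clusters() is defined elsewhere in the backend; as an assumption about what such a probe looks like, the CUDA 12 runtime exposes the capability directly as a device attribute:

#include <cuda_runtime.h>

// Sketch of a cluster-capability probe (hypothetical name; the backend's
// cuda_check_support_thread_block_clusters() is assumed equivalent).
bool device_supports_thread_block_clusters(int gpu_index) {
  int supported = 0;
  // Non-zero on devices that can launch thread block clusters
  // (compute capability 9.0+, CUDA 12 runtime).
  cudaDeviceGetAttribute(&supported, cudaDevAttrClusterLaunch, gpu_index);
  return supported != 0;
}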
template <typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
return false;
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc = 0;
if (supports_distributed_shared_memory_on_classic_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory))
minimum_sm_tbc =
get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
polynomial_size);
int cluster_size;
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
cudaLaunchConfig_t config = {0};
// The grid dimension is not affected by cluster launch, and is still
// enumerated using number of blocks. The grid dimension should be a multiple
// of cluster size.
config.gridDim = grid_accumulate;
config.blockDim = thds;
config.numAttrs = 0;
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
&config));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, &config));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size, device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
&config));
}
return cluster_size >= level_count * (glwe_dimension + 1);
}
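
For a sense of scale under CUDA's documented cluster limits (a worked note, not code from the backend):

// The cluster must span one whole PBS: level_count * (glwe_dimension + 1)
// blocks. With glwe_dimension = 1 this is 2 blocks at level_count = 1 and 4
// at level_count = 2. The portable maximum is 8 blocks per cluster; H100
// accepts up to 16 once cudaFuncAttributeNonPortableClusterSizeAllowed is
// set, which is why the checks above opt in before querying
// cudaOccupancyMaxPotentialClusterSize.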
#endif // CUDA_TBC_PBS_CUH

View File

@@ -0,0 +1,480 @@
#ifndef CUDA_TBC_MULTIBIT_PBS_CUH
#define CUDA_TBC_MULTIBIT_PBS_CUH
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.cuh"
#include "types/complex/operations.cuh"
#include <vector>
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input, int8_t *device_mem,
uint64_t device_memory_size_per_block, bool support_dsm) {
cluster_group cluster = this_cluster();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM) {
// The first (polynomial_size/2) * sizeof(double2) bytes are reserved for
// external product using distributed shared memory
selected_memory = sharedmem;
if (support_dsm)
selected_memory += sizeof(Torus) * polynomial_size;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
if constexpr (SMD == PARTIALSM) {
accumulator_fft = (double2 *)sharedmem;
if (support_dsm)
accumulator_fft += (ptrdiff_t)(polynomial_size / 2);
}
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *keybundle = keybundle_array +
// select the input
blockIdx.z * keybundle_size_per_input;
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
}
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, cluster_group, params>(
accumulator, accumulator_fft, block_join_buffer, keybundle,
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
synchronize_threads_in_block();
}
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
} else {
// Persist the accumulator so the next chunk iteration can reload it
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // distributed shared memory
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator_fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator + accumulator_fft
}
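
To make the three budgets concrete, a worked example with Torus = uint64_t and N = 2048 (values derived from the formulas above; the H100 limit is the documented opt-in maximum):

// For Torus = uint64_t (8 bytes) and polynomial_size N = 2048:
//   dsm     = 8 * 2048     = 16 KiB (one polynomial exchanged through
//                                    distributed shared memory)
//   partial = 8 * 2048     = 16 KiB (accumulator_fft: N/2 double2 values)
//   full    = 8 * 2048 * 2 = 32 KiB (accumulator + accumulator_fft)
// An H100 allows up to ~227 KiB of dynamic shared memory per block after
// opt-in, so full + dsm = 48 KiB fits and the FULLSM path is selected.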
template <typename Torus, typename STorus, typename params>
__host__ void scratch_tbc_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
bool supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_tbc_accumulate =
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_sm_tbc_accumulate =
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc_accumulate = 0;
if (supports_dsm)
minimum_sm_tbc_accumulate =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (max_shared_memory <
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory <
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, input_lwe_ciphertext_count, polynomial_size,
max_shared_memory);
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
allocate_gpu_memory);
}
template <typename Torus, class params>
__host__ void execute_tbc_external_product_loop(
cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_chunk_size, uint32_t max_shared_memory,
int lwe_offset) {
auto supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory);
uint64_t full_dm =
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_dm =
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t minimum_dm = 0;
if (supports_dsm)
minimum_dm =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
auto d_mem = buffer->d_mem_acc_tbc;
auto keybundle_fft = buffer->keybundle_fft;
auto global_accumulator = buffer->global_accumulator;
auto buffer_fft = buffer->global_accumulator_fft;
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
cudaLaunchConfig_t config = {0};
// The grid dimension is not affected by cluster launch, and is still
// enumerated using number of blocks. The grid dimension should be a multiple
// of cluster size.
config.gridDim = grid_accumulate;
config.blockDim = thds;
cudaLaunchAttribute attribute[1];
attribute[0].id = cudaLaunchAttributeClusterDimension;
attribute[0].val.clusterDim.x = level_count; // Cluster size in X-dimension
attribute[0].val.clusterDim.y = (glwe_dimension + 1);
attribute[0].val.clusterDim.z = 1;
config.attrs = attribute;
config.numAttrs = 1;
config.stream = stream->stream;
if (max_shared_memory < partial_dm + minimum_dm) {
config.dynamicSmemBytes = minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, full_dm, supports_dsm));
} else if (max_shared_memory < full_dm + minimum_dm) {
config.dynamicSmemBytes = partial_dm + minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, partial_dm, supports_dsm));
} else {
config.dynamicSmemBytes = full_dm + minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, grouping_factor, lwe_offset, chunk_size,
keybundle_size_per_input, d_mem, 0, supports_dsm));
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
stream->gpu_index, num_samples, polynomial_size, max_shared_memory);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
stream, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, max_shared_memory,
lwe_chunk_size, lwe_offset);
// Accumulate
execute_tbc_external_product_loop<Torus, params>(
stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
lwe_chunk_size, max_shared_memory, lwe_offset);
}
}
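
The chunked loop is easiest to see with numbers (illustrative values, not a fixed parameter set):

// With lwe_dimension = 888 and grouping_factor = 3 there are 888 / 3 = 296
// keybundle positions. A chunk size of 64 then gives ceil(296 / 64) = 5
// iterations: four full chunks of 64 and a final partial chunk of
// 296 - 4 * 64 = 40, which is why execute_tbc_external_product_loop clamps
// chunk_size with std::min.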
template <typename Torus>
__host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory) {
uint64_t minimum_sm =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
if (max_shared_memory <= minimum_sm) {
// If we cannot store a single polynomial in a block's shared memory we
// cannot use TBC
return false;
} else {
return cuda_check_support_thread_block_clusters();
}
}
template <typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
if (!cuda_check_support_thread_block_clusters())
return false;
uint64_t full_sm_tbc_accumulate =
get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_sm_tbc_accumulate =
get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t minimum_sm_tbc_accumulate = 0;
if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
Torus>(polynomial_size, max_shared_memory))
minimum_sm_tbc_accumulate =
get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
polynomial_size);
int cluster_size;
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
cudaLaunchConfig_t config = {0};
// The grid dimension is not affected by cluster launch, and is still
// enumerated using number of blocks. The grid dimension should be a multiple
// of cluster size.
config.gridDim = grid_accumulate;
config.blockDim = thds;
config.numAttrs = 0;
if (max_shared_memory <
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
&config));
} else if (max_shared_memory <
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
&config));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, true));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
&config));
}
return cluster_size >= level_count * (glwe_dimension + 1);
}
template __host__ bool
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
uint32_t polynomial_size, uint32_t max_shared_memory);
#endif // CUDA_TBC_MULTIBIT_PBS_CUH

View File

@@ -50,12 +50,7 @@ public:
constexpr static int opt = choose_opt_amortized(N);
constexpr static int log2_degree = log2(N);
};
enum sharedMemDegree {
NOSM = 0,
PARTIALSM = 1,
FULLSM = 2
};
enum sharedMemDegree { NOSM = 0, PARTIALSM = 1, FULLSM = 2 };
class ForwardFFT {
public:

View File

@@ -167,6 +167,38 @@ public:
}
};
#if CUDA_ARCH >= 900
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) {
st.SkipWithError("Configuration not supported for tbc operation");
return;
}
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
true, chunk_size);
for (auto _ : st) {
// Execute PBS
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
cuda_synchronize_stream(stream);
}
cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
}
#endif
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
@@ -221,6 +253,39 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
cleanup_cuda_multi_bit_programmable_bootstrap(stream, &buffer);
}
#if CUDA_ARCH >= 900
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) {
st.SkipWithError("Configuration not supported for tbc operation");
return;
}
scratch_cuda_programmable_bootstrap_tbc<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer, glwe_dimension,
polynomial_size, pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(stream->gpu_index), true);
for (auto _ : st) {
// Execute PBS
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
stream, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
(pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
input_lwe_ciphertext_count, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index));
cuda_synchronize_stream(stream);
}
cleanup_cuda_programmable_bootstrap(stream, &buffer);
}
#endif
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
@@ -317,6 +382,9 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
for (auto x : params) {
for (int input_lwe_ciphertext_count = 1; input_lwe_ciphertext_count <= 4096;
input_lwe_ciphertext_count *= 2) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
x.grouping_factor, 0});
for (int lwe_chunk_size = 1;
lwe_chunk_size <= x.lwe_dimension / x.grouping_factor;
lwe_chunk_size *= 2)
@@ -332,6 +400,25 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
}
}
static void
CGBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
// input_lwe_ciphertext_count
// PARAM_MESSAGE_2_CARRY_2_KS_PBS
std::vector<BootstrapBenchmarkParams> params = {
(BootstrapBenchmarkParams){742, 1, 2048, 23, 1, 1},
};
// Add to the list of parameters to benchmark
for (int num_samples = 1; num_samples <= 4096; num_samples *= 2)
for (auto x : params) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, num_samples});
}
}
static void
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
@@ -351,6 +438,14 @@ BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
}
}
#if CUDA_ARCH >= 900
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, TbcMultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
"grouping_factor", "chunk_size"});
#endif
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, CgMultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
@@ -363,11 +458,23 @@ BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, DefaultMultiBit)
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
"grouping_factor", "chunk_size"});
#if CUDA_ARCH >= 900
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, TbcPBC)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
#endif
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, DefaultPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, CgPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, AmortizedPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",

View File

@@ -1,9 +1,9 @@
#ifndef UTILS_H
#define UTILS_H
#include "tfhe.h"
#include <device.h>
#include <functional>
#include <tfhe.h>
typedef struct Seed {
uint64_t lo;

View File

@@ -205,7 +205,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
EXPECT_NE(decrypted, plaintext);
ASSERT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
// plaintext
// - decrypted;
@@ -216,7 +216,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta);
ASSERT_EQ(decoded, plaintext / delta);
}
}
}

View File

@@ -227,7 +227,13 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
888, 1, 16384,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
128, 3, 1, 10});
128, 3, 1, 10},
(MultiBitProgrammableBootstrapTestParams){
972, 1, 8192,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 14, 2, 8, 8,
68, 3, 1, 1});
std::string printParamName(
::testing::TestParamInfo<MultiBitProgrammableBootstrapTestParams> p) {
MultiBitProgrammableBootstrapTestParams params = p.param;

View File

@@ -687,6 +687,37 @@ extern "C" {
pub fn cleanup_cuda_full_propagation(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_apply_univariate_lut_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
input_lut: *const c_void,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_apply_univariate_lut_kb_64(
v_stream: *const c_void,
output_radix_lwe: *mut c_void,
input_radix_lwe: *const c_void,
mem_ptr: *mut i8,
ksk: *const c_void,
bsk: *const c_void,
num_blocks: u32,
);
pub fn cleanup_cuda_apply_univariate_lut_kb_64(v_stream: *const c_void, mem_ptr: *mut *mut i8);
pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
@@ -959,4 +990,37 @@ extern "C" {
mem_ptr: *mut *mut i8,
);
pub fn scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
v_stream: *const c_void,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_integer_div_rem_radix_ciphertext_kb_64(
v_stream: *const c_void,
quotient: *mut c_void,
remainder: *mut c_void,
numerator: *const c_void,
divisor: *const c_void,
mem_ptr: *mut i8,
bsk: *const c_void,
ksk: *const c_void,
num_blocks: u32,
);
pub fn cleanup_cuda_integer_div_rem(v_stream: *const c_void, mem_ptr: *mut *mut i8);
} // extern "C"

View File

@@ -57,7 +57,7 @@ def check_security(filename):
estimator_level = log(min(usvp_level["rop"], dual_level["rop"]), 2)
security_level = f"security level = {estimator_level} bits"
if estimator_level < 127:
print("FAIL\t({security_level})")
print(f"FAIL\t({security_level})")
reason = f"attained {security_level} target is 128 bits"
to_update.append((param, reason))
continue

View File

@@ -8,11 +8,13 @@ function usage() {
echo "--help Print this message"
echo "--build-only Pass to only build the tests without running them"
echo "--gpu Enable GPU support"
echo "--cargo-profile The profile used to build TFHE-rs, release by default"
echo
}
BUILD_ONLY=0
WITH_FEATURE_GPU="OFF"
CARGO_PROFILE="release"
while [ -n "$1" ]
do
case "$1" in
@@ -28,6 +30,12 @@ do
"--gpu" )
WITH_FEATURE_GPU="ON"
;;
"--cargo-profile" )
shift
CARGO_PROFILE="$1"
;;
*)
echo "Unknown param : $1"
exit 1

View File

@@ -1,3 +1,5 @@
#![allow(non_local_definitions)]
use ark_ec::bls12::{Bls12, Bls12Config, TwistType};
use ark_ff::fields::*;
use ark_ff::MontFp;
@@ -576,8 +578,8 @@ pub mod g1 {
writer.write_all(&bytes)?;
} else {
let mut bytes = [0u8; 2 * G1_SERIALIZED_SIZE];
bytes[1..1 + G1_SERIALIZED_SIZE].copy_from_slice(&x_bytes[..]);
bytes[2 + G1_SERIALIZED_SIZE..].copy_from_slice(&serialize_fq(p.y)[..]);
bytes[1..G1_SERIALIZED_SIZE].copy_from_slice(&x_bytes[..]);
bytes[1 + G1_SERIALIZED_SIZE..].copy_from_slice(&serialize_fq(p.y)[..]);
encoding.encode_flags(&mut bytes);
writer.write_all(&bytes)?;

View File

@@ -91,6 +91,21 @@ pub struct PrivateCommit<G: Curve> {
__marker: PhantomData<G>,
}
pub fn compute_crs_params(
d: usize,
k: usize,
b: u64,
_q: u64, // we keep q here to make sure the API is consistent with [crs_gen]
t: u64,
) -> (usize, usize, u64) {
let b_r = d as u64 / 2 + 1;
let big_d =
d + k * t.ilog2() as usize + (d + k) * (2 + b.ilog2() as usize + b_r.ilog2() as usize);
let n = big_d + 1;
(n, big_d, b_r)
}
pub fn crs_gen<G: Curve>(
d: usize,
k: usize,
@@ -100,11 +115,7 @@ pub fn crs_gen<G: Curve>(
rng: &mut dyn RngCore,
) -> PublicParams<G> {
let alpha = G::Zp::rand(rng);
let b_r = d as u64 / 2 + 1;
let big_d =
d + k * t.ilog2() as usize + (d + k) * (2 + b.ilog2() as usize + b_r.ilog2() as usize);
let n = big_d + 1;
let (n, big_d, b_r) = compute_crs_params(d, k, b, q, t);
PublicParams {
g_lists: GroupElements::<G>::new(n, alpha),
big_d,

View File

@@ -11,6 +11,7 @@ use tfhe::keycache::NamedParam;
use tfhe::shortint::parameters::{
PARAM_GPU_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_3_KS_PBS,
PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
PARAM_GPU_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_3_KS_PBS,
};
#[cfg(not(feature = "gpu"))]
use tfhe::shortint::parameters::{
@@ -42,9 +43,10 @@ const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 6] = [
];
#[cfg(feature = "gpu")]
const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 2] = [
const SHORTINT_MULTI_BIT_BENCH_PARAMS: [MultiBitPBSParameters; 3] = [
PARAM_GPU_MULTI_BIT_MESSAGE_1_CARRY_1_GROUP_3_KS_PBS,
PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS,
PARAM_GPU_MULTI_BIT_MESSAGE_3_CARRY_3_GROUP_3_KS_PBS,
];
const BOOLEAN_BENCH_PARAMS: [(&str, BooleanParameters); 2] = [

View File

@@ -39,9 +39,9 @@ impl Default for ParamsAndNumBlocksIter {
if env_config.is_multi_bit {
#[cfg(feature = "gpu")]
let params = vec![PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
let params = vec![PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()];
#[cfg(not(feature = "gpu"))]
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()];
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
Self {
@@ -1538,6 +1538,11 @@ mod cuda {
display_name: mul
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_div_rem,
display_name: div_rem
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: unchecked_add,
display_name: add
@@ -1720,6 +1725,11 @@ mod cuda {
display_name: mul
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: div_rem,
display_name: div_rem
);
define_cuda_server_key_bench_clean_input_fn!(
method_name: ne,
display_name: not_equal
@@ -1898,6 +1908,7 @@ mod cuda {
cuda_unchecked_bitor,
cuda_unchecked_bitxor,
cuda_unchecked_mul,
cuda_unchecked_div_rem,
cuda_unchecked_sub,
cuda_unchecked_unsigned_overflowing_sub,
cuda_unchecked_add,
@@ -1938,6 +1949,7 @@ mod cuda {
cuda_unsigned_overflowing_sub,
cuda_add,
cuda_mul,
cuda_div_rem,
cuda_eq,
cuda_ne,
cuda_ge,
@@ -1982,7 +1994,7 @@ mod cuda {
display_name: &str,
cast_op: F,
) where
F: Fn(&CudaServerKey, CudaUnsignedRadixCiphertext, usize),
F: Fn(&CudaServerKey, CudaUnsignedRadixCiphertext, usize, &CudaStream),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
@@ -2021,7 +2033,7 @@ mod cuda {
b.iter_batched(
encrypt_one_value,
|ct| {
cast_op(&gpu_sks, ct, target_num_blocks);
cast_op(&gpu_sks, ct, target_num_blocks, &stream);
},
criterion::BatchSize::SmallInput,
)
@@ -2046,12 +2058,12 @@ mod cuda {
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_server_key_cast_function(
cuda_bench_server_key_cast_function(
c,
concat!("integer::cuda::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs| {
server_key.$server_key_method(lhs, rhs);
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
})
}
}
@@ -2406,7 +2418,7 @@ fn go_through_gpu_bench_groups(val: &str) {
"default" => {
default_cuda_ops();
default_scalar_cuda_ops();
cuda_cast_ops()
cuda_cast_ops();
}
"unchecked" => {
unchecked_cuda_ops();

View File

@@ -826,7 +826,7 @@ macro_rules! define_server_key_bench_binary_scalar_clean_inputs_fn (
fn $server_key_method(c: &mut Criterion) {
bench_server_key_binary_scalar_function_clean_inputs(
c,
concat!("integer::", stringify!($server_key_method)),
concat!("integer::signed::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs| {
server_key.$server_key_method(lhs, rhs);
@@ -2147,11 +2147,106 @@ mod cuda {
cuda_scalar_min,
cuda_scalar_max,
);
fn cuda_bench_server_key_signed_cast_function<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
cast_op: F,
) where
F: Fn(&CudaServerKey, CudaSignedRadixCiphertext, usize, &CudaStream),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let mut rng = rand::thread_rng();
let env_config = EnvConfig::new();
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (param, num_blocks, bit_size) in ParamsAndNumBlocksIter::default() {
let all_num_blocks = env_config
.bit_sizes()
.iter()
.copied()
.map(|bit| bit.div_ceil(param.message_modulus().0.ilog2() as usize))
.collect::<Vec<_>>();
let param_name = param.name();
for target_num_blocks in all_num_blocks.iter().copied() {
let target_bit_size =
target_num_blocks * param.message_modulus().0.ilog2() as usize;
let bench_id =
format!("{bench_name}::{param_name}::{bit_size}_to_{target_bit_size}");
bench_group.bench_function(&bench_id, |b| {
let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_one_value = || -> CudaSignedRadixCiphertext {
let ct = cks.encrypt_signed_radix(gen_random_i256(&mut rng), num_blocks);
CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct, &stream)
};
b.iter_batched(
encrypt_one_value,
|ct| {
cast_op(&gpu_sks, ct, target_num_blocks, &stream);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_blocks],
);
}
}
bench_group.finish()
}
macro_rules! define_cuda_server_key_bench_signed_cast_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
cuda_bench_server_key_signed_cast_function(
c,
concat!("integer::cuda::signed::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
})
}
}
}
);
define_cuda_server_key_bench_signed_cast_fn!(
method_name: cast_to_unsigned,
display_name: cast_to_unsigned
);
define_cuda_server_key_bench_signed_cast_fn!(
method_name: cast_to_signed,
display_name: cast_to_signed
);
criterion_group!(cuda_cast_ops, cuda_cast_to_unsigned, cuda_cast_to_signed);
}
#[cfg(feature = "gpu")]
use cuda::{
default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops,
cuda_cast_ops, default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops,
unchecked_scalar_cuda_ops,
};
#[cfg(feature = "gpu")]
@@ -2160,6 +2255,7 @@ fn go_through_gpu_bench_groups(val: &str) {
"default" => {
default_cuda_ops();
default_scalar_cuda_ops();
cuda_cast_ops();
}
"unchecked" => {
unchecked_cuda_ops();

View File

@@ -1,8 +1,8 @@
// If this test breaks, the c_api doc needs to be updated
#include "tfhe.h"
#include <assert.h>
#include <stdio.h>
#include <tfhe.h>
int main(void) {
int ok = 0;

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,4 +1,4 @@
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -1,5 +1,5 @@
#if defined(WITH_FEATURE_GPU)
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <inttypes.h>

View File

@@ -24,7 +24,7 @@
* - NUM_CPU: number of CPU threads (tfhe automatically creates them internally)
*/
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <pthread.h>

View File

@@ -1,7 +1,6 @@
#include "tfhe.h"
#include <assert.h>
#include <stdlib.h>
#include <tfhe.h>
int main(void) {
// We want to use zk-proof, which requires bounded random distributions

View File

@@ -4,7 +4,7 @@ The first step is the creation of the configuration. The configuration is used t
A configuration can be created by using the ConfigBuilder type.
In this example, 8-bit unsigned integers with default parameters are used. The `integers` feature must also be enabled, as per the table on [this page](broken-reference).
In this example, 8-bit unsigned integers with default parameters are used. The `integers` feature must also be enabled, as per the table on [this page](../guides/rust\_configuration.md#homomorphic-types).
The config is generated by first creating a builder with all types deactivated. Then, the integer types with default parameters are activated, since we are going to use FheUint8 values.
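A minimal sketch of this configuration step, assuming the `ConfigBuilder::default()` shorthand (the same move the JS example later in this diff makes), which activates the default integer parameters:

```rust
use tfhe::{generate_keys, ConfigBuilder};

fn main() {
    // Default builder: integer types such as FheUint8 come enabled with
    // their default parameters, as described above.
    let config = ConfigBuilder::default().build();
    // The config is then consumed by key generation.
    let (_client_key, _server_key) = generate_keys(config);
}
```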

View File

@@ -3,7 +3,7 @@
The basic steps for using the high-level API of TFHE-rs are:
1. [Importing the TFHE-rs prelude](quick\_start.md#imports);
2. Client-side: [Configuring and creating keys](../fundamentals/configure-and-create-keys.md);
2. Client-side: [Configuring and creating keys](../fundamentals/configure-and-generate-keys.md);
3. Client-side: [Encrypting data](../fundamentals/encrypt-data.md);
4. Server-side: [Setting the server key](../fundamentals/set-the-server-key.md);
5. Server-side: [Computing over encrypted data](../fundamentals/compute.md);
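Taken together, the steps above map onto a short program. A minimal sketch, assuming the quick-start high-level API (`generate_keys`, `set_server_key`, `FheUint8`):

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    // Steps 1-2, client-side: configure and create keys.
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);

    // Step 3, client-side: encrypt data.
    let a = FheUint8::encrypt(27u8, &client_key);
    let b = FheUint8::encrypt(15u8, &client_key);

    // Steps 4-5, server-side: set the server key, then compute.
    set_server_key(server_key);
    let c = &a + &b;

    // Back on the client: decrypt and check.
    let dec: u8 = c.decrypt(&client_key);
    assert_eq!(dec, 42);
}
```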

View File

@@ -79,7 +79,7 @@ $
```c
#include <tfhe.h>
#include "tfhe.h"
#include <assert.h>
#include <stdio.h>

View File

@@ -35,8 +35,7 @@ function fhe_uint32_example() {
init_panic_hook();
const block_params = new ShortintParameters(ShortintParametersName.PARAM_SMALL_MESSAGE_2_CARRY_2_COMPACT_PK);
let config = TfheConfigBuilder.all_disabled()
.enable_default_integers()
let config = TfheConfigBuilder.default()
.build();
let clientKey = TfheClientKey.generate(config);

View File

@@ -201,7 +201,7 @@ All operations follow the same syntax as the one described [here](../gettin
# Benchmarks
All GPU benchmarks presented here were obtained on a single H100 GPU, and rely on the multithreaded PBS algorithm.
The cryptographic parameters PARAM\_GPU\_MULTI\_BIT\_MESSAGE\_1\_CARRY\_2\_GROUP\_3\_KS\_PBS were used.
The cryptographic parameters PARAM\_GPU\_MULTI\_BIT\_MESSAGE\_2\_CARRY\_2\_GROUP\_3\_KS\_PBS were used.
Performance is the following when the inputs of the benchmarked operation are encrypted:
| Operation \ Size | `FheUint7` | `FheUint16` | `FheUint32` | `FheUint64` | `FheUint128` | `FheUint256` |

View File

@@ -170,10 +170,10 @@ pub unsafe extern "C" fn core_crypto_lwe_encrypt(
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let seed = Seed(seed);
let mut determinisitic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
determinisitic_seeder.seed(),
&mut determinisitic_seeder,
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let plaintext = Plaintext(pt);
@@ -219,10 +219,10 @@ pub unsafe extern "C" fn core_crypto_ggsw_encrypt(
let seed = (seed_high_bytes << 64) | seed_low_bytes;
let seed = Seed(seed);
let mut determinisitic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut deterministic_seeder = DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut encryption_generator = EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
determinisitic_seeder.seed(),
&mut determinisitic_seeder,
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let plaintext = Plaintext(pt);

View File

@@ -3246,7 +3246,12 @@ mod test {
let mut thread_rng = rand::thread_rng();
for _ in 0..10_000 {
#[cfg(not(tarpaulin))]
let nb_tests = 10_000;
#[cfg(tarpaulin)]
let nb_tests = 1;
for _ in 0..nb_tests {
let lwe_sk =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
@@ -3305,7 +3310,12 @@ mod test {
let mut thread_rng = rand::thread_rng();
for _ in 0..100 {
#[cfg(not(tarpaulin))]
let nb_tests = 100;
#[cfg(tarpaulin)]
let nb_tests = 1;
for _ in 0..nb_tests {
// We'll encrypt between 1 and 4 * lwe_dimension ciphertexts
let ct_count: usize = thread_rng.gen();
let ct_count = ct_count % (lwe_dimension.0 * 4) + 1;
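Both hunks above apply the same coverage trick: cargo-tarpaulin compiles with `--cfg tarpaulin`, so iteration counts can be collapsed under coverage, where extra iterations exercise the same lines without adding coverage. A minimal sketch of the pattern (constant name and counts illustrative):

```rust
// Full iteration count for regular `cargo test` runs...
#[cfg(not(tarpaulin))]
const NB_TESTS: usize = 10_000;
// ...and a single iteration under tarpaulin coverage measurement.
#[cfg(tarpaulin)]
const NB_TESTS: usize = 1;

#[test]
fn property_holds() {
    for _ in 0..NB_TESTS {
        assert_eq!(1 + 1, 2); // stand-in for the real property check
    }
}
```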

View File

@@ -206,7 +206,10 @@ where
mod test {
use super::*;
#[cfg(not(tarpaulin))]
const NB_TESTS: usize = 1_000_000_000;
#[cfg(tarpaulin)]
const NB_TESTS: usize = 1;
#[test]
fn test_divide_funcs() {
@@ -295,7 +298,7 @@ mod test {
use rand::Rng;
let mut rng = rand::thread_rng();
for _ in 0..1_000_000_000 {
for _ in 0..NB_TESTS {
let value: u64 = rng.gen();
// This is an easy case where we expect the values to match exactly, to cover other
// cases we would be re coding the algorithms here.

View File

@@ -993,7 +993,10 @@ fn lwe_compact_public_encrypt_decrypt_custom_mod<Scalar: UnsignedTorus>(
}
create_parametrized_test!(lwe_compact_public_encrypt_decrypt_custom_mod {
TEST_PARAMS_4_BITS_NATIVE_U64
#[cfg(not(tarpaulin))]
TEST_PARAMS_4_BITS_NATIVE_U64,
#[cfg(tarpaulin)]
COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64
});
#[cfg(feature = "zk-pok-experimental")]
@@ -1095,7 +1098,10 @@ fn lwe_compact_public_encrypt_prove_verify_decrypt_custom_mod<Scalar>(
#[cfg(feature = "zk-pok-experimental")]
create_parametrized_test!(lwe_compact_public_encrypt_prove_verify_decrypt_custom_mod {
TEST_PARAMS_4_BITS_NATIVE_U64
#[cfg(not(tarpaulin))]
TEST_PARAMS_4_BITS_NATIVE_U64,
#[cfg(tarpaulin)]
COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64
});
#[cfg(feature = "zk-pok-experimental")]
@@ -1316,3 +1322,219 @@ fn test_par_compact_lwe_list_public_key_encryption_and_proof() {
assert_eq!(ser_lwe_ct_list, par_lwe_ct_list);
}
}
#[test]
fn test_compact_public_key_encryption() {
use rand::Rng;
#[cfg(not(tarpaulin))]
let nb_tests = 10_000;
#[cfg(tarpaulin)]
let nb_tests = 100;
#[cfg(not(tarpaulin))]
let lwe_dimension = LweDimension(2048);
#[cfg(tarpaulin)]
let lwe_dimension = LweDimension(1);
let glwe_noise_distribution =
Gaussian::from_dispersion_parameter(StandardDev(0.00000000000000029403601535432533), 0.0);
let ciphertext_modulus = CiphertextModulus::new_native();
let mut secret_random_generator = test_tools::new_secret_random_generator();
let mut encryption_random_generator = test_tools::new_encryption_random_generator();
let mut thread_rng = rand::thread_rng();
for _ in 0..nb_tests {
let lwe_sk = LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
let mut compact_lwe_pk = LweCompactPublicKey::new(0u64, lwe_dimension, ciphertext_modulus);
generate_lwe_compact_public_key(
&lwe_sk,
&mut compact_lwe_pk,
glwe_noise_distribution,
&mut encryption_random_generator,
);
let msg: u64 = thread_rng.gen();
let msg = msg % 16;
let plaintext = Plaintext(msg << 60);
let mut output_ct = LweCiphertext::new(
0u64,
lwe_dimension.to_lwe_size(),
CiphertextModulus::new_native(),
);
encrypt_lwe_ciphertext_with_compact_public_key(
&compact_lwe_pk,
&mut output_ct,
plaintext,
glwe_noise_distribution,
glwe_noise_distribution,
&mut secret_random_generator,
&mut encryption_random_generator,
);
let decrypted_plaintext = decrypt_lwe_ciphertext(&lwe_sk, &output_ct);
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
let cleartext = signed_decomposer.closest_representable(decrypted_plaintext.0) >> 60;
assert_eq!(cleartext, msg);
}
}
#[test]
fn test_par_compact_lwe_list_public_key_encryption_equivalence() {
use rand::Rng;
let lwe_dimension = LweDimension(2048);
let glwe_noise_distribution =
Gaussian::from_dispersion_parameter(StandardDev(0.00000000000000029403601535432533), 0.0);
let ciphertext_modulus = CiphertextModulus::new_native();
let mut thread_rng = rand::thread_rng();
for _ in 0..NB_TESTS {
// We'll encrypt between 1 and 4 * lwe_dimension ciphertexts
let ct_count: usize = thread_rng.gen();
let ct_count = ct_count % (lwe_dimension.0 * 4) + 1;
let lwe_ciphertext_count = LweCiphertextCount(ct_count);
let seed = test_tools::random_seed();
let mut input_plaintext_list =
PlaintextList::new(0u64, PlaintextCount(lwe_ciphertext_count.0));
input_plaintext_list.iter_mut().for_each(|x| {
let msg: u64 = thread_rng.gen();
*x.0 = (msg % 16) << 60;
});
let par_lwe_ct_list = {
let mut deterministic_seeder =
DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut secret_random_generator =
SecretRandomGenerator::<ActivatedRandomGenerator>::new(deterministic_seeder.seed());
let mut encryption_random_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let lwe_sk =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
let mut compact_lwe_pk =
LweCompactPublicKey::new(0u64, lwe_dimension, ciphertext_modulus);
generate_lwe_compact_public_key(
&lwe_sk,
&mut compact_lwe_pk,
glwe_noise_distribution,
&mut encryption_random_generator,
);
let mut output_compact_ct_list = LweCompactCiphertextList::new(
0u64,
lwe_dimension.to_lwe_size(),
lwe_ciphertext_count,
ciphertext_modulus,
);
par_encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
&compact_lwe_pk,
&mut output_compact_ct_list,
&input_plaintext_list,
glwe_noise_distribution,
glwe_noise_distribution,
&mut secret_random_generator,
&mut encryption_random_generator,
);
let mut output_plaintext_list = input_plaintext_list.clone();
output_plaintext_list.as_mut().fill(0u64);
let lwe_ciphertext_list = output_compact_ct_list.par_expand_into_lwe_ciphertext_list();
decrypt_lwe_ciphertext_list(&lwe_sk, &lwe_ciphertext_list, &mut output_plaintext_list);
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
output_plaintext_list
.iter_mut()
.for_each(|x| *x.0 = signed_decomposer.closest_representable(*x.0));
assert_eq!(input_plaintext_list, output_plaintext_list);
lwe_ciphertext_list
};
let ser_lwe_ct_list = {
let mut deterministic_seeder =
DeterministicSeeder::<ActivatedRandomGenerator>::new(seed);
let mut secret_random_generator =
SecretRandomGenerator::<ActivatedRandomGenerator>::new(deterministic_seeder.seed());
let mut encryption_random_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(
deterministic_seeder.seed(),
&mut deterministic_seeder,
);
let lwe_sk =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_random_generator);
let mut compact_lwe_pk =
LweCompactPublicKey::new(0u64, lwe_dimension, ciphertext_modulus);
generate_lwe_compact_public_key(
&lwe_sk,
&mut compact_lwe_pk,
glwe_noise_distribution,
&mut encryption_random_generator,
);
let mut output_compact_ct_list = LweCompactCiphertextList::new(
0u64,
lwe_dimension.to_lwe_size(),
lwe_ciphertext_count,
ciphertext_modulus,
);
encrypt_lwe_compact_ciphertext_list_with_compact_public_key(
&compact_lwe_pk,
&mut output_compact_ct_list,
&input_plaintext_list,
glwe_noise_distribution,
glwe_noise_distribution,
&mut secret_random_generator,
&mut encryption_random_generator,
);
let mut output_plaintext_list = input_plaintext_list.clone();
output_plaintext_list.as_mut().fill(0u64);
let lwe_ciphertext_list = output_compact_ct_list.expand_into_lwe_ciphertext_list();
decrypt_lwe_ciphertext_list(&lwe_sk, &lwe_ciphertext_list, &mut output_plaintext_list);
let signed_decomposer =
SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
output_plaintext_list
.iter_mut()
.for_each(|x| *x.0 = signed_decomposer.closest_representable(*x.0));
assert_eq!(input_plaintext_list, output_plaintext_list);
lwe_ciphertext_list
};
assert_eq!(ser_lwe_ct_list, par_lwe_ct_list);
}
}

View File

@@ -212,7 +212,7 @@ fn test_lwe_encrypt_ks_switch_mod_decrypt_custom_mod() {
// In coverage, we break after one while-loop iteration; changing message values does not
// yield higher coverage
#[cfg(feature = "__coverage")]
#[cfg(tarpaulin)]
break;
}
}

View File

@@ -12,10 +12,18 @@ fn test_seeded_lwe_ksk_gen_equivalence<Scalar: UnsignedTorus + Send + Sync>(
// DISCLAIMER: these toy example parameters are not guaranteed to be secure or yield correct
// computations
// Define parameters for LweKeyswitchKey creation
#[cfg(not(tarpaulin))]
let input_lwe_dimension = LweDimension(742);
#[cfg(tarpaulin)]
let input_lwe_dimension = LweDimension(1);
#[cfg(not(tarpaulin))]
let output_lwe_dimension = LweDimension(2048);
#[cfg(tarpaulin)]
let output_lwe_dimension = LweDimension(32);
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(StandardDev(0.000007069849454709433));
let output_lwe_dimension = LweDimension(2048);
let decomp_base_log = DecompositionBaseLog(3);
let decomp_level_count = DecompositionLevelCount(5);

View File

@@ -535,11 +535,13 @@ fn std_lwe_encrypt_multi_bit_deterministic_pbs_decrypt_custom_mod<Scalar>(
#[test]
pub fn test_lwe_encrypt_multi_bit_pbs_decrypt_factor_2_thread_5_native_mod() {
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_2_2_2_PARAMS);
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_3_3_2_PARAMS);
}
#[test]
pub fn test_lwe_encrypt_multi_bit_pbs_decrypt_factor_3_thread_12_native_mod() {
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_2_2_3_PARAMS);
lwe_encrypt_multi_bit_pbs_decrypt_custom_mod::<u64>(MULTI_BIT_3_3_3_PARAMS);
}
#[test]

View File

@@ -12,9 +12,17 @@ fn test_seeded_lwe_pksk_gen_equivalence<Scalar: UnsignedTorus>(
// DISCLAIMER: these toy example parameters are not guaranteed to be secure or yield correct
// computations
// Define parameters for LweKeyswitchKey creation
#[cfg(not(tarpaulin))]
let input_lwe_dimension = LweDimension(742);
let output_glwe_dimension = GlweDimension(1);
#[cfg(tarpaulin)]
let input_lwe_dimension = LweDimension(1);
#[cfg(not(tarpaulin))]
let output_polynomial_size = PolynomialSize(2048);
#[cfg(tarpaulin)]
let output_polynomial_size = PolynomialSize(32);
let output_glwe_dimension = GlweDimension(1);
let glwe_noise_distribution = DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
));

View File

@@ -398,6 +398,32 @@ pub const TEST_PARAMS_4_BITS_NATIVE_U128: ClassicTestParams<u128> = ClassicTestP
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U128: ClassicTestParams<u128> = ClassicTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
4.9982771e-11,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
8.6457178e-32,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ks_level: DecompositionLevelCount(5),
ks_base_log: DecompositionBaseLog(3),
pfks_level: DecompositionLevelCount(1),
pfks_base_log: DecompositionBaseLog(23),
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
cbs_level: DecompositionLevelCount(0),
cbs_base_log: DecompositionBaseLog(0),
message_modulus_log: MessageModulusLog(4),
ciphertext_modulus: CiphertextModulus::new_native(),
};
pub const TEST_PARAMS_3_BITS_127_U128: ClassicTestParams<u128> = ClassicTestParams {
lwe_dimension: LweDimension(742),
glwe_dimension: GlweDimension(1),
@@ -423,6 +449,32 @@ pub const TEST_PARAMS_3_BITS_127_U128: ClassicTestParams<u128> = ClassicTestPara
ciphertext_modulus: CiphertextModulus::new(1 << 127),
};
#[cfg(tarpaulin)]
pub const COVERAGE_TEST_PARAMS_3_BITS_127_U128: ClassicTestParams<u128> = ClassicTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
4.9982771e-11,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
8.6457178e-32,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ks_level: DecompositionLevelCount(5),
ks_base_log: DecompositionBaseLog(3),
pfks_level: DecompositionLevelCount(1),
pfks_base_log: DecompositionBaseLog(23),
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
cbs_level: DecompositionLevelCount(0),
cbs_base_log: DecompositionBaseLog(0),
message_modulus_log: MessageModulusLog(3),
ciphertext_modulus: CiphertextModulus::new(1 << 127),
};
fn lwe_encrypt_pbs_f128_decrypt_custom_mod<Scalar>(params: ClassicTestParams<Scalar>)
where
Scalar: UnsignedTorus
@@ -537,9 +589,15 @@ where
#[test]
fn lwe_encrypt_pbs_f128_decrypt_custom_mod_test_params_4_bits_native_u128() {
#[cfg(not(tarpaulin))]
lwe_encrypt_pbs_f128_decrypt_custom_mod(TEST_PARAMS_4_BITS_NATIVE_U128);
#[cfg(tarpaulin)]
lwe_encrypt_pbs_f128_decrypt_custom_mod(COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U128);
}
#[test]
fn lwe_encrypt_pbs_f128_decrypt_custom_mod_test_params_3_bits_127_u128() {
#[cfg(not(tarpaulin))]
lwe_encrypt_pbs_f128_decrypt_custom_mod(TEST_PARAMS_3_BITS_127_U128);
#[cfg(tarpaulin)]
lwe_encrypt_pbs_f128_decrypt_custom_mod(COVERAGE_TEST_PARAMS_3_BITS_127_U128);
}

View File

@@ -24,6 +24,7 @@ mod lwe_packing_keyswitch_key_generation;
mod lwe_private_functional_packing_keyswitch;
pub(crate) mod lwe_programmable_bootstrapping;
mod modulus_switch_compression;
#[cfg(not(tarpaulin))]
mod noise_distribution;
pub struct TestResources {
@@ -78,6 +79,32 @@ pub const TEST_PARAMS_4_BITS_NATIVE_U64: ClassicTestParams<u64> = ClassicTestPar
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64: ClassicTestParams<u64> = ClassicTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(32),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.000007069849454709433,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ks_level: DecompositionLevelCount(5),
ks_base_log: DecompositionBaseLog(3),
pfks_level: DecompositionLevelCount(1),
pfks_base_log: DecompositionBaseLog(23),
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
cbs_level: DecompositionLevelCount(0),
cbs_base_log: DecompositionBaseLog(0),
message_modulus_log: MessageModulusLog(4),
ciphertext_modulus: CiphertextModulus::new_native(),
};
pub const TEST_PARAMS_3_BITS_63_U64: ClassicTestParams<u64> = ClassicTestParams {
lwe_dimension: LweDimension(742),
glwe_dimension: GlweDimension(1),
@@ -103,6 +130,32 @@ pub const TEST_PARAMS_3_BITS_63_U64: ClassicTestParams<u64> = ClassicTestParams
ciphertext_modulus: CiphertextModulus::new(1 << 63),
};
#[cfg(tarpaulin)]
pub const COVERAGE_TEST_PARAMS_3_BITS_63_U64: ClassicTestParams<u64> = ClassicTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(32),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.000007069849454709433,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ks_level: DecompositionLevelCount(5),
ks_base_log: DecompositionBaseLog(3),
pfks_level: DecompositionLevelCount(1),
pfks_base_log: DecompositionBaseLog(23),
pfks_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000029403601535432533,
)),
cbs_level: DecompositionLevelCount(0),
cbs_base_log: DecompositionBaseLog(0),
message_modulus_log: MessageModulusLog(3),
ciphertext_modulus: CiphertextModulus::new(1 << 63),
};
pub const TEST_PARAMS_3_BITS_SOLINAS_U64: ClassicTestParams<u64> = ClassicTestParams {
lwe_dimension: LweDimension(742),
glwe_dimension: GlweDimension(1),
@@ -196,6 +249,24 @@ pub const MULTI_BIT_2_2_2_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
thread_count: ThreadCount(5),
};
pub const MULTI_BIT_3_3_2_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(922),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.0000003272369292345697,
)),
decomp_base_log: DecompositionBaseLog(14),
decomp_level_count: DecompositionLevelCount(2),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(8192),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.0000000000000000002168404344971009,
)),
message_modulus_log: MessageModulusLog(6),
ciphertext_modulus: CiphertextModulus::new_native(),
grouping_factor: LweBskGroupingFactor(2),
thread_count: ThreadCount(5),
};
pub const MULTI_BIT_2_2_2_CUSTOM_MOD_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(818),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
@@ -232,6 +303,24 @@ pub const MULTI_BIT_2_2_3_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
thread_count: ThreadCount(12),
};
pub const MULTI_BIT_3_3_3_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(972),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000013016688349592805,
)),
decomp_base_log: DecompositionBaseLog(14),
decomp_level_count: DecompositionLevelCount(2),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(8192),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.0000000000000000002168404344971009,
)),
message_modulus_log: MessageModulusLog(6),
ciphertext_modulus: CiphertextModulus::new_native(),
grouping_factor: LweBskGroupingFactor(3),
thread_count: ThreadCount(5),
};
pub const MULTI_BIT_2_2_3_CUSTOM_MOD_PARAMS: MultiBitTestParams<u64> = MultiBitTestParams {
input_lwe_dimension: LweDimension(888),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
@@ -311,6 +400,70 @@ pub const FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
// DISCLAIMER: example parameters tailored for coverage tests. They are not guaranteed
// to be secure or to yield correct computations.
#[cfg(tarpaulin)]
pub const COVERAGE_FFT_U32_PARAMS: FftTestParams<u32> = FftTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000004998277131225527,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000000000000000000008645717832544903,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_FFT_U64_PARAMS: FftTestParams<u64> = FftTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000004998277131225527,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000000000000000000008645717832544903,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_FFT_U128_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000004998277131225527,
)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000000000000000000008645717832544903,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ciphertext_modulus: CiphertextModulus::new_native(),
};
#[cfg(tarpaulin)]
pub const COVERAGE_FFT128_U128_PARAMS: FftTestParams<u128> = FftTestParams {
lwe_dimension: LweDimension(1),
glwe_dimension: GlweDimension(1),
polynomial_size: PolynomialSize(64),
lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(0.12345)),
glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev(
0.00000000000000000000000000000008645717832544903,
)),
pbs_base_log: DecompositionBaseLog(23),
pbs_level: DecompositionLevelCount(1),
ciphertext_modulus: CiphertextModulus::<u128>::new_native(),
};
pub const FFT_WOPBS_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams {
lwe_dimension: LweDimension(481),
glwe_dimension: GlweDimension(1),
@@ -480,21 +633,33 @@ pub(crate) fn gen_keys_or_get_from_cache_if_enabled<
// Macro to generate tests for all parameter sets
macro_rules! create_parametrized_test{
($name:ident { $($param:ident),* $(,)? }) => {
(
$name:ident {
$($(#[$cfg:meta])* $param:ident),*
$(,)?
}
) => {
::paste::paste! {
$(
#[test]
fn [<test_ $name _ $param:lower>]() {
$name($param)
}
#[test]
$(#[$cfg])*
fn [<test_ $name _ $param:lower>]() {
$name($param)
}
)*
}
};
($name:ident)=> {
create_parametrized_test!($name
{
#[cfg(not(tarpaulin))]
TEST_PARAMS_4_BITS_NATIVE_U64,
TEST_PARAMS_3_BITS_63_U64
#[cfg(not(tarpaulin))]
TEST_PARAMS_3_BITS_63_U64,
#[cfg(tarpaulin)]
COVERAGE_TEST_PARAMS_4_BITS_NATIVE_U64,
#[cfg(tarpaulin)]
COVERAGE_TEST_PARAMS_3_BITS_63_U64
});
};
}
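For reference, a hand-expanded sketch of one arm of the macro above, with hypothetical test and parameter names:

```rust
// `create_parametrized_test!(my_check { #[cfg(not(tarpaulin))] MY_PARAMS })`
// expands via `paste!` to roughly:
#[test]
#[cfg(not(tarpaulin))]
fn test_my_check_my_params() {
    my_check(MY_PARAMS)
}
```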

View File

@@ -9,7 +9,6 @@ pub struct ClassicBootstrapKeys<Scalar: UnsignedInteger> {
pub small_lwe_sk: LweSecretKey<Vec<Scalar>>,
pub big_lwe_sk: LweSecretKey<Vec<Scalar>>,
pub bsk: LweBootstrapKeyOwned<Scalar>,
pub fbsk: FourierLweBootstrapKeyOwned,
}

View File

@@ -2,6 +2,7 @@
pub(crate) mod mask_random_generator;
pub(crate) mod noise_random_generator;
#[cfg(not(tarpaulin))]
#[cfg(test)]
mod test;

Some files were not shown because too many files have changed in this diff.