Compare commits

...

72 Commits

Author SHA1 Message Date
J-B Orfila
d0937aae20 pbs count 2024-03-20 18:28:02 +01:00
Arthur Meyre
e81152a630 chore(tfhe): remove last remaining modular_std_dev
- some places were not updated; remove the last non-modular std_dev
- the ones used to dump parameters are modular, so they are kept
2024-03-12 11:12:40 +01:00
Pedro Alves
8c4675dc3e fix(gpu): fix a bug in integer multiplication 2024-03-12 09:57:39 +01:00
Pedro Alves
29fb4fbe77 chore(gpu): refactor low-latency and multi-bit PBSs so the buffer is a structured object 2024-03-12 09:57:39 +01:00
Agnes Leroy
f84c34c903 feat(gpu): signed scalar add 2024-03-11 14:49:39 +01:00
dependabot[bot]
cc905a04c7 chore(deps): bump tj-actions/changed-files from 42.0.5 to 42.1.0
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.5 to 42.1.0.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](800a282599...aa08304bd4)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-11 09:30:41 +01:00
Arthur Meyre
3fc791e813 chore(ci): to avoid stack overflow crashes increase thread stack size
- The default Linux thread stack size seems to be 8 MB, but Rust limits spawned
threads to 2 MB by default; change that to avoid tests failing because of overflowed stacks
2024-03-08 19:49:22 +01:00
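
For context, a minimal Rust sketch (not part of this changeset) of the two levers involved: the RUST_MIN_STACK environment variable, which the workflow diffs below set to 8388608 bytes (8 MiB), and an explicit per-thread stack size via std::thread::Builder.

use std::thread;

fn main() {
    // RUST_MIN_STACK=8388608 raises the 2 MiB default stack size that
    // Rust gives spawned threads; the variable is read at runtime.
    // Alternatively, a single thread can be sized explicitly:
    let handle = thread::Builder::new()
        .stack_size(8 * 1024 * 1024) // 8 MiB, matching the CI setting
        .spawn(|| {
            // deeply recursive or stack-hungry test code goes here
        })
        .expect("failed to spawn thread");
    handle.join().unwrap();
}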
Pedro Alves
d4f8fb8f57 feat(gpu): Implement benchmarks for low-latency and amortized PBS in all variants, and for the FFT 2024-03-08 14:04:53 -03:00
Pedro Alves
68ce43d2f0 feat(gpu): implement custom benchmarks 2024-03-08 14:04:53 -03:00
Arthur Meyre
c5b9e5400a chore(ci): make sure dev_bench is checked by clippy
- removed the experimental feature requirement
2024-03-08 10:56:36 +01:00
David Testé
8167c85764 chore(bench): reduce measurement duration to 60 for pbs benchmarks
This is done to shorten overall benchmark duration.
2024-03-08 09:16:17 +01:00
tmontaigu
98bd45503c chore(hlapi): add some GPU tests for FheUint
Tests are not complete yet, but it's the first step to get there
2024-03-07 20:08:11 +01:00
Agnes Leroy
ed50042719 feat(gpu): signed mul with tests and benchmarks 2024-03-07 15:37:52 +01:00
David Testé
053d56a3d6 chore(ci): format benchmark results parser with black 2024-03-07 13:33:46 +01:00
David Testé
e5b117ca29 chore(ci): handle new name format to get pbs throughput values
The core_crypto benchmark name format has been changed to reflect
what's used in other layers. The benchmark result parser was no longer
able to compute the right value for PBS throughput.
2024-03-07 13:33:46 +01:00
tmontaigu
9de486f33c chore(integer): move & hardden sub/neg tests
Also start making non parallel test use test cases
2024-03-07 10:38:27 +01:00
Arthur Meyre
ccf879c9ae refactor(tfhe): plug NoiseDistribution in the various APIs 2024-03-07 10:24:15 +01:00
Mayeul@Zama
d3c1f91948 test(shortint): add oprf deterministic test 2024-03-06 17:19:05 +01:00
Arthur Meyre
273dbe1b85 chore(core): make torus_modular_diff safer to use 2024-03-06 15:54:06 +01:00
Agnes Leroy
7ac061266f feat(gpu): signed sub and neg with tests and benchmarks
Refactor tests along the way to avoid huge test files.
2024-03-06 15:53:51 +01:00
Agnes Leroy
c1c56ab770 fix(gpu): fix memory bug in multi-bit PBS 2024-03-06 14:18:29 +01:00
Pedro Alves
00dad37812 chore(gpu): replace recommended lwe_chunk_size for NVIDIA Tesla H100 GPUs 2024-03-06 07:10:22 -03:00
Arthur Meyre
f94533d70d chore(ci): fix CUDA_PATH bin not being exported in GITHUB_PATH 2024-03-06 09:22:45 +01:00
David Testé
b7d7e68d0c chore(ci): run static linter on workflows 2024-03-05 15:00:09 +01:00
David Testé
e8135c207d chore(ci): fix lint errors in workflows 2024-03-05 15:00:09 +01:00
Arthur Meyre
601b200351 chore(ci): fix workflows, missing leading $, skipped does not exist
- avoid spamming if cancelled
2024-03-04 18:19:46 +01:00
Arthur Meyre
a0d5bf2fc2 feat(core): switch GLWE primitives to the new noise distribution system 2024-03-04 15:01:25 +01:00
dependabot[bot]
58223dea09 chore(deps): bump tj-actions/changed-files from 42.0.4 to 42.0.5
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.4 to 42.0.5.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](3f54ebb830...800a282599)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-04 09:59:53 +01:00
dependabot[bot]
1f3096b743 chore(deps): bump codecov/codecov-action from 3.1.5 to 4.1.0
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3.1.5 to 4.1.0.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/codecov/codecov-action/compare/v3.1.5...54bcd8715eee62d40e33596ef5e8f0f48dbbccab)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-04 09:59:15 +01:00
Agnes Leroy
4a3d5d432a chore(gpu): fix integer bench workflows 2024-03-04 09:30:10 +01:00
Agnes Leroy
c6bfcd75a6 chore(gpu): add pbs throughput benchmarks 2024-03-04 09:30:10 +01:00
Agnes Leroy
85dfd70c6b chore(bench): fully load the cpu for throughput benches 2024-03-04 09:30:10 +01:00
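
As an aside, one common way to fully load the CPU for a throughput measurement (a sketch only, not necessarily what this commit does; the rayon-based harness below is hypothetical) is to run a batch of operations across all cores and divide by elapsed time:

use rayon::prelude::*;
use std::time::Instant;

// Hypothetical harness: run `elements` operations in parallel on all
// cores and report operations per second.
fn throughput<F: Fn() + Sync>(elements: usize, op: F) -> f64 {
    let start = Instant::now();
    (0..elements).into_par_iter().for_each(|_| op());
    elements as f64 / start.elapsed().as_secs_f64()
}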
Agnes Leroy
c720656340 chore(gpu): bench signed add on gpu 2024-03-04 09:26:34 +01:00
Agnes Leroy
1c209403a6 feat(gpu): signed addition 2024-03-04 09:26:34 +01:00
tmontaigu
347fc9aaa7 chore(hlapi): add cuda tests for FheBool 2024-03-01 17:17:37 +01:00
tmontaigu
198485b5fb feat(hlapi): bin cuda scalar_eq/ne on FheBool 2024-03-01 17:17:37 +01:00
David Testé
bd7547c93d chore(bench): benchmark 4 bits integer operations 2024-03-01 14:39:58 +01:00
Arthur Meyre
955495d714 refactor(core): change layout of compact public key encryption for LWE list
- this makes sure the product computed for the first ciphertext matches the
product computed for a single ciphertext in the non-list case

BREAKING CHANGE:
all previous compact public key list encryptions are not compatible with
the new layout
2024-03-01 11:05:04 +01:00
David Testé
902755c33c feat(core_crypto): add parallelized pfpks with lwe ciphertext list 2024-02-29 18:05:57 +01:00
Arthur Meyre
89f845fa4f refactor(tfhe): use dynamic noise distributions for LWE primitives 2024-02-29 18:05:12 +01:00
Arthur Meyre
9f89d2c09d chore(core): lighten the bound to be generable from a Gaussian distribution 2024-02-29 18:05:12 +01:00
Arthur Meyre
ea0d146ed0 chore(core): add missing unsigned integer slice add noise primitives 2024-02-29 18:05:12 +01:00
tmontaigu
943ccdf450 chore(integer): harden unsigned add tests
This adds degrees and noise levels checks as well as comparing
individual decrypted block values with their degrees.
2024-02-29 17:24:25 +01:00
tmontaigu
f39896ac63 refactor(integer): start refactoring tests
This starts splitting the long radix test files into
smaller ones, starting with the add family of functions.
2024-02-29 17:24:25 +01:00
Pedro Alves
46a87c6f89 fix(gpu): fix scalar eq for booleans 2024-02-29 11:51:49 +01:00
David Testé
a5579532be chore(ci): add product cost for rtx4090 to compute throughput
The RTX 4090 we're using here is owned by Zama, so we don't pay an
hourly rate to AWS per se. But in order to compute throughput on
benchmark results, the parser needs a numeric value corresponding
to the hardware used. The ops-per-dollar metric is not really used
today, whereas ops-per-second is.
In the end we use an approximation of the cost of electrical
consumption.
2024-02-28 15:53:08 +01:00
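
As a rough illustration of the metric itself (helper name hypothetical, not the parser's actual code), ops-per-dollar follows directly from measured ops-per-second and an hourly hardware cost:

// Hypothetical helper: combine a measured throughput with an hourly
// hardware cost (here, an approximation of electricity cost for the
// Zama-owned RTX 4090) into an ops-per-dollar figure.
fn ops_per_dollar(ops_per_second: f64, cost_per_hour_usd: f64) -> f64 {
    ops_per_second * 3600.0 / cost_per_hour_usd
}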
Agnes Leroy
41e1781226 chore(gpu): move ciphertext info to dedicated file 2024-02-28 09:02:36 +01:00
Agnes Leroy
697ce94ee2 chore(gpu): remove duplicated test params 2024-02-28 09:02:21 +01:00
Arthur Meyre
a667b654ef chore(tfhe): use div_ceil now that MSRV is 1.73 2024-02-27 18:35:54 +01:00
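
div_ceil was stabilized on the unsigned integer types in Rust 1.73, which is what this cleanup relies on; it replaces the usual add-then-divide rounding idiom, as in this small sketch:

fn main() {
    let bits: u64 = 130;
    let word_size: u64 = 64;
    // Pre-1.73 idiom (can also overflow when `bits` is near u64::MAX):
    let words_old = (bits + word_size - 1) / word_size;
    // Stabilized in Rust 1.73:
    let words_new = bits.div_ceil(word_size);
    assert_eq!(words_old, 3);
    assert_eq!(words_new, 3);
}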
Arthur Meyre
1bff07b6eb chore(tfhe): update rust MSRV to 1.73 2024-02-27 18:35:54 +01:00
David Testé
59664e84c8 chore(bench): format core_crypto benchmark names to ease parsing 2024-02-27 18:05:35 +01:00
Agnes Leroy
79dc101728 chore(gpu): fix 4090 bench workflow 2024-02-27 17:46:20 +01:00
Arthur Meyre
6828438898 chore(tfhe): bump version to 0.6.0 2024-02-27 13:24:10 +01:00
Arthur Meyre
a8f4cf7c29 chore(cuda): bump backend version to 0.2.0 2024-02-27 13:24:10 +01:00
David Testé
30d2f5f66d chore(ci): add coverage build make recipe 2024-02-27 09:29:03 +01:00
David Testé
112cc6f6c9 chore(ci): remove private feature __coverage to use tarpaulin cfg 2024-02-27 09:29:03 +01:00
David Testé
93581f7ee1 chore(ci): add integer layer to code coverage
A special set of cryptographic parameters has been created to
speed up test execution in coverage mode. These parameter sets are
*NOT* guaranteed to be secure nor to yield correct results.
2024-02-27 09:29:03 +01:00
David Testé
6e08e91109 chore(ci): checkout repo with fetch-depth 0 to get commit hash
The computed COMMIT_HASH variable needs fetch-depth=0 so that
git describe can see the repository's version tags.
2024-02-27 08:50:38 +01:00
Agnes Leroy
75f0ad1d4b chore(gpu): add core crypto benches to 4090 bench workflow 2024-02-27 08:50:38 +01:00
Arthur Meyre
618758bd95 fix(core): fix unsigned noise addition for custom modulus 2024-02-26 22:19:01 +01:00
Arthur Meyre
d770a271b3 chore(core): add custom power of 2 support for u128 2024-02-26 22:19:01 +01:00
David Testé
80468494b2 chore(ci): lock version of lattice-estimator in workflow
The latest version of lattice-estimator produces overflow errors.
We pin the checkout to the last working version to avoid a red
CI.
2024-02-26 22:18:06 +01:00
Pedro Alves
26e5af542f feat(gpu): Reintroduce a tool to independently test PBS, keyswitch, and FFT on the C++ side. 2024-02-26 13:44:32 -03:00
Arthur Meyre
f23b4f21dc chore(core): remove the possibility to seed the NoiseRandomGenerator
- to further avoid misuse, the NoiseRandomGenerator itself now requires a
seeder
- removed the possibility to re-seed the noise generator; even in tests, we
now have access to deterministic seeders, which did not use to be the case
2024-02-26 13:28:17 +01:00
Arthur Meyre
b394da3dbb chore(tfhe): remove unused distributions 2024-02-26 13:28:17 +01:00
Arthur Meyre
6007cd2c81 chore(core): refactor byte counts for runtime noise distribution choice
- we will want to be able to choose a noise distribution at runtime rather than
keep a hard-coded Gaussian; we therefore need to be able to adapt to the
number of bytes a distribution may require to properly generate a sample
2024-02-26 13:28:17 +01:00
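
A minimal sketch of that idea (trait and names hypothetical; they do not match tfhe-rs internals): each distribution reports how many random bytes one sample consumes, so buffer sizes can be computed for a distribution chosen at runtime.

// Hypothetical sketch; names do not match the tfhe-rs API.
trait NoiseDistribution {
    /// Random bytes needed to generate one sample.
    fn bytes_per_sample(&self) -> usize;
}

struct Gaussian;       // e.g. two uniform draws for one Gaussian sample
struct BoundedUniform; // a single uniform draw

impl NoiseDistribution for Gaussian {
    fn bytes_per_sample(&self) -> usize { 16 }
}
impl NoiseDistribution for BoundedUniform {
    fn bytes_per_sample(&self) -> usize { 8 }
}

fn buffer_size(dist: &dyn NoiseDistribution, samples: usize) -> usize {
    dist.bytes_per_sample() * samples
}

fn main() {
    assert_eq!(buffer_size(&Gaussian, 1024), 16 * 1024);
}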
Arthur Meyre
a6fdc46794 chore(core): rename Encryption RNG primitives to match noise distribution
- we are shifting to non-hardcoded noise distributions for encryption;
rename the mask and noise generation functions to indicate which hard-coded
distribution was used initially
2024-02-26 13:28:17 +01:00
dependabot[bot]
0134a4a0f2 chore(deps): bump codecov/codecov-action from 4.0.1 to 4.0.2
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 4.0.1 to 4.0.2.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](e0b68c6749...0cfda1dd0a)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-02-26 13:27:24 +01:00
dependabot[bot]
68dfd96993 chore(deps): bump tj-actions/changed-files from 42.0.3 to 42.0.4
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.3 to 42.0.4.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](ec75ae5ab7...3f54ebb830)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-02-26 13:27:07 +01:00
David Testé
6811177178 chore(ci): fix missing backslash in rtx benchmark workflow
This missing backslash causes the Python command to fail since
some input arguments are missing.
2024-02-26 09:32:02 +01:00
Pedro Alves
753c7aa0d2 chore(gpu): minor improvements to the LUT generation function and to
are_all_comparisons_block_true()
2024-02-24 08:49:59 +01:00
tmontaigu
f38a9a9b4c feat(integer): add ilog2 and checked_ilog2 2024-02-23 18:55:34 +01:00
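
These presumably mirror the plaintext semantics of Rust's integer ilog2/checked_ilog2 (floor of the base-2 logarithm, with the checked variant returning None for zero), shown here on clear values:

fn main() {
    assert_eq!(8u32.ilog2(), 3);               // floor(log2(8)) = 3
    assert_eq!(9u32.ilog2(), 3);               // floor(log2(9)) = 3
    assert_eq!(1u32.checked_ilog2(), Some(0));
    assert_eq!(0u32.checked_ilog2(), None);    // log2(0) undefined; ilog2 would panic
}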
284 changed files with 20278 additions and 10141 deletions

9
.github/actionlint.yaml vendored Normal file
View File

@@ -0,0 +1,9 @@
self-hosted-runner:
# Labels of self-hosted runner in array of strings.
labels:
- m1mac
- 4090-desktop
# Configuration variables in array of strings defined in your repository or
# organization. `null` means disabling configuration variables check.
# Empty array means no configuration variable is allowed.
config-variables: null

View File

@@ -6,6 +6,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}

View File

@@ -6,6 +6,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -41,9 +42,9 @@ jobs:
- name: Run clippy checks
run: |
make clippy_gpu
make pcc_gpu
- name: Run all tests
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
@@ -55,6 +56,10 @@ jobs:
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
if: ${{ github.event_name == 'pull_request' }}
with:

View File

@@ -6,6 +6,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -78,10 +79,12 @@ jobs:
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run fmt checks
run: |
@@ -89,9 +92,9 @@ jobs:
- name: Run clippy checks
run: |
make clippy_gpu
make pcc_gpu
- name: Run all tests
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
@@ -103,6 +106,9 @@ jobs:
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
- name: Slack Notification
if: ${{ always() }}

View File

@@ -5,6 +5,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}

View File

@@ -5,6 +5,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}

View File

@@ -5,6 +5,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}

View File

@@ -5,6 +5,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}

View File

@@ -33,6 +33,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-boolean-benchmarks:

View File

@@ -7,6 +7,7 @@ env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
@@ -67,5 +68,9 @@ jobs:
run: |
make build_c_api
- name: Build coverage tests
run: |
make build_tfhe_coverage
# The wasm build check is a bit annoying to set-up here and is done during the tests in
# aws_tfhe_tests.yml

27
.github/workflows/ci_lint.yml vendored Normal file
View File

@@ -0,0 +1,27 @@
# Lint and check CI
name: CI Lint and Checks
on:
pull_request:
env:
ACTIONLINT_VERSION: 1.6.27
jobs:
lint-check:
name: Lint and checks
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Get actionlint
run: |
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) ${{ env.ACTIONLINT_VERSION }}
echo "f2ee6d561ce00fa93aab62a7791c1a0396ec7e8876b2a8f2057475816c550782 actionlint" > checksum
sha256sum -c checksum
ln -s "$(pwd)/actionlint" /usr/local/bin/
- name: Lint workflows
run: |
make lint_workflow

View File

@@ -5,6 +5,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -39,7 +40,7 @@ jobs:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
timeout-minutes: 1080
timeout-minutes: 11520 # 8 days
steps:
# Step used for log purpose.
- name: Instance configuration used
@@ -68,7 +69,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ec75ae5ab7296b81fd4cddb77294d6718932ebab
uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11
with:
files_yaml: |
tfhe:
@@ -98,7 +99,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@e0b68c6749509c5f83f984dd99a76a1c1a231044
uses: codecov/codecov-action@54bcd8715eee62d40e33596ef5e8f0f48dbbccab
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -106,6 +107,20 @@ jobs:
fail_ci_if_error: true
files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml
- name: Run integer coverage
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@54bcd8715eee62d40e33596ef5e8f0f48dbbccab
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./coverage/
fail_ci_if_error: true
files: integer/cobertura.xml
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true

View File

@@ -33,6 +33,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-core-crypto-benchmarks:

View File

@@ -78,19 +78,23 @@ jobs:
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run benchmarks with AVX512
run: |

View File

@@ -5,6 +5,7 @@ env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}

View File

@@ -0,0 +1,202 @@
# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
name: TFHE Cuda Backend - 4090 full benchmarks
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [labeled]
schedule:
# Weekly benchmarks will be triggered each Friday at 9p.m.
- cron: "0 21 * * 5"
jobs:
cuda-integer-benchmarks:
name: Cuda integer benchmarks for all operations flavor (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Run integer benchmarks
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "rtx4090" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Integer RTX 4090 full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
cuda-core-crypto-benchmarks:
name: Cuda core crypto benchmarks (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
needs: cuda-integer-benchmarks
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Run integer benchmarks
run: |
make bench_pbs_gpu
make bench_ks_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "rtx4090" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
remove_github_label:
name: Remove 4090 bench label
if: ${{ github.event_name == 'pull_request' }}
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
runs-on: ["self-hosted", "4090-desktop"]
steps:
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
with:
labels: 4090_bench
github_token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -26,6 +26,7 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:

View File

@@ -29,6 +29,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
prepare-matrix:
@@ -40,17 +41,17 @@ jobs:
- name: Weekly benchmarks
if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
run: |
echo "OP_FLAVOR=[\"default\"]" >> ${GITHUB_ENV}
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
- name: Quarterly benchmarks
if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
run: |
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> ${GITHUB_ENV}
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"
- name: Set operation flavor output
id: set_op_flavor
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> ${GITHUB_OUTPUT}
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
integer-benchmarks:
name: Execute integer benchmarks for all operations flavor
@@ -79,9 +80,11 @@ jobs:
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.

View File

@@ -1,110 +0,0 @@
# Run all integer benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
name: TFHE Cuda Backend - 4090 Integer full benchmarks
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [labeled]
schedule:
# Weekly benchmarks will be triggered each Friday at 9p.m.
- cron: '0 21 * * 5'
jobs:
cuda-integer-benchmarks:
name: Cuda integer benchmarks for all operations flavor (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [ integer, integer_multi_bit]
op_flavor: [ default, unchecked ]
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Run benchmarks with AVX512
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "rtx4090"
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
if: ${{ github.event_name == 'pull_request' }}
with:
labels: 4090_bench
github_token: ${{ secrets.GITHUB_TOKEN }}
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Integer RTX 4090 full benchmarks failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -26,6 +26,7 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -72,19 +73,22 @@ jobs:
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Run benchmarks with AVX512
run: |
@@ -145,7 +149,7 @@ jobs:
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:

View File

@@ -33,19 +33,21 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
integer-benchmarks:
name: Execute integer benchmarks for all operations flavor
runs-on: ${{ github.event.inputs.runner_name }}
timeout-minutes: 1440 # 24 hours
if: ${{ !cancelled() }}
continue-on-error: true
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [ integer, integer_multi_bit]
op_flavor: [ default, unchecked ]
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
# explicit include-based build matrix, of known valid options
include:
- os: ubuntu-22.04
@@ -68,9 +70,11 @@ jobs:
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
@@ -85,19 +89,22 @@ jobs:
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
@@ -147,7 +154,7 @@ jobs:
slack-notification:
name: Slack Notification
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ failure() }}
if: ${{ !success() && !cancelled() }}
needs: integer-benchmarks
steps:
- name: Notify

View File

@@ -26,6 +26,7 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:

View File

@@ -1,5 +1,5 @@
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
name: Integer Multi-bit benchmarks
name: Integer GPU Multi-bit benchmarks
on:
workflow_dispatch:
@@ -26,11 +26,13 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
cuda-integer-benchmarks:
name: Execute integer multi-bit benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
timeout-minutes: 1440 # 24 hours
if: ${{ !cancelled() }}
strategy:
fail-fast: false
@@ -38,8 +40,7 @@ jobs:
matrix:
include:
- os: ubuntu-22.04
cuda: "11.8"
cuda_arch: "70"
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
@@ -73,19 +74,22 @@ jobs:
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Run multi-bit benchmarks with AVX512
run: |
@@ -146,7 +150,7 @@ jobs:
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:

View File

@@ -15,6 +15,7 @@ env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
FAST_TESTS: "TRUE"

View File

@@ -24,6 +24,7 @@ jobs:
with:
repository: malb/lattice-estimator
path: lattice_estimator
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
- name: Install Sage
run: |

View File

@@ -25,6 +25,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-shortint-benchmarks:

View File

@@ -33,6 +33,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
shortint-benchmarks:
@@ -58,9 +59,11 @@ jobs:
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.

View File

@@ -26,6 +26,7 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:

View File

@@ -29,6 +29,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
integer-benchmarks:
@@ -57,9 +58,11 @@ jobs:
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.

View File

@@ -26,6 +26,7 @@ env:
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:

View File

@@ -64,7 +64,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ec75ae5ab7296b81fd4cddb77294d6718932ebab
uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11
with:
files_yaml: |
common_benches:

View File

@@ -33,6 +33,7 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-wasm-client-benchmarks:

3
.gitignore vendored
View File

@@ -19,3 +19,6 @@ dieharder_run.log
# Coverage reports
/coverage/
# Cuda local build
backends/tfhe-cuda-backend/cuda/cmake-build-debug/

View File

@@ -61,7 +61,7 @@ REGEX_STRING?=''
REGEX_PATTERN?=''
# tfhe-cuda-backend
TFHECUDA_SRC="backends/tfhe-cuda-backend/cuda"
TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
# Exclude these files from coverage reports
@@ -144,6 +144,11 @@ check_linelint_installed:
@printf "\n" | linelint - > /dev/null 2>&1 || \
( echo "Unable to locate linelint. Try installing it: https://github.com/fernandrone/linelint/releases" && exit 1 )
.PHONY: check_actionlint_installed # Check if actionlint workflow linter is installed
check_actionlint_installed:
@actionlint --version > /dev/null 2>&1 || \
( echo "Unable to locate actionlint. Try installing it: https://github.com/rhysd/actionlint/releases" && exit 1 )
.PHONY: fmt # Format rust code
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
@@ -163,7 +168,7 @@ check_fmt_gpu: install_rs_check_toolchain
cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain clippy_cuda_backend
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
--all-targets \
@@ -177,6 +182,10 @@ fix_newline: check_linelint_installed
check_newline: check_linelint_installed
linelint .
.PHONY: lint_workflow # Run static linter on GitHub workflows
lint_workflow: check_actionlint_installed
actionlint
.PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
clippy_core: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -301,6 +310,11 @@ build_tfhe_full: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
.PHONY: build_tfhe_coverage # Build with test coverage enabled
build_tfhe_coverage: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
.PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
symlink_c_libs_without_fingerprint:
@./scripts/symlink_c_libs_without_fingerprint.sh \
@@ -370,18 +384,26 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
-p $(TFHE_SPEC) -- core_crypto::
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage,$(AVX512_FEATURE) \
-p $(TFHE_SPEC) -- core_crypto::; \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
fi
.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
test_cuda_backend:
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
make -j && \
make test
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_gpu: test_core_crypto_gpu test_integer_gpu
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
@@ -407,8 +429,8 @@ test_boolean_cov: install_rs_check_toolchain install_tarpaulin
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
$(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,__coverage \
-p $(TFHE_SPEC) -- boolean::
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::
.PHONY: test_c_api_rs # Run the rust tests for the C API
test_c_api_rs: install_rs_check_toolchain
@@ -452,8 +474,8 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
$(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,__coverage \
-p $(TFHE_SPEC) -- shortint::
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::
.PHONY: test_integer_ci # Run the tests for integer ci
test_integer_ci: install_rs_check_toolchain install_cargo_nextest
@@ -513,12 +535,26 @@ test_integer: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::
.PHONY: test_integer_cov # Run the tests of the integer module with code coverage
test_integer_cov: install_rs_check_toolchain install_tarpaulin
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/integer --line --engine llvm --timeout 500 \
--implicit-test-threads \
--exclude-files $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::
.PHONY: test_high_level_api # Run all the tests for high_level_api
test_high_level_api: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
-- high_level_api::
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*gpu.*/)"
.PHONY: test_user_doc # Run tests from the .md documentation
test_user_doc: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
@@ -605,6 +641,16 @@ check_compile_tests:
./scripts/c_api_tests.sh --build-only; \
fi
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
check_compile_tests_benches_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,gpu \
-p $(TFHE_SPEC)
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
make -j
.PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
build_nodejs_test_docker:
DOCKER_BUILDKIT=1 docker build --build-arg RUST_TOOLCHAIN="$(RS_BUILD_TOOLCHAIN)" \
@@ -771,7 +817,7 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
#
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
gen_key_cache: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
--example generates_test_keys \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -- \
$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
@@ -847,7 +893,7 @@ sha256_bool: install_rs_check_toolchain
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: pcc clippy_gpu
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests

View File

@@ -85,7 +85,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"]
```
> [!Note]
> Note: You need to use a Rust version >= 1.72 to compile TFHE-rs.
> Note: You need to use a Rust version >= 1.73 to compile TFHE-rs.
> [!Note]
> Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.1.2"
version = "0.2.0"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"

View File

@@ -0,0 +1 @@
/build/

View File

@@ -71,10 +71,13 @@ set(CMAKE_CUDA_FLAGS
set(INCLUDE_DIR include)
add_subdirectory(src)
enable_testing()
add_subdirectory(tests_and_benchmarks)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
# This is required for rust cargo build
install(TARGETS tfhe_cuda_backend DESTINATION .)
install(TARGETS tfhe_cuda_backend DESTINATION lib)
# Define a function to add a lint target.
@@ -86,5 +89,3 @@ if(CPPLINT)
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif()
enable_testing()

View File

@@ -6,14 +6,14 @@ while getopts ":c" option; do
case $option in
c)
# code to execute when flag1 is provided
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
git diff --exit-code
exit
;;
esac
done
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'

View File

@@ -5,6 +5,7 @@
#include <cstdint>
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
enum PBS_VARIANT { DEFAULT = 0, FAST = 1 };
extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
@@ -54,13 +55,13 @@ void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer);
void scratch_cuda_bootstrap_low_latency_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_low_latency_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
@@ -68,7 +69,7 @@ void scratch_cuda_bootstrap_low_latency_64(
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
@@ -76,13 +77,16 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
int8_t **pbs_buffer);
void cleanup_cuda_bootstrap_low_latency_32(cuda_stream_t *stream,
int8_t **pbs_buffer);
void cleanup_cuda_bootstrap_low_latency_64(cuda_stream_t *stream,
int8_t **pbs_buffer);
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -93,6 +97,212 @@ uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::LOW_LAT> {
int8_t *d_mem;
Torus *global_accumulator;
double2 *global_accumulator_fft;
PBS_VARIANT pbs_variant;
pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory) {
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
if (allocate_gpu_memory) {
switch (pbs_variant) {
case PBS_VARIANT::DEFAULT: {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(
polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm = full_sm_step_one;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_two) {
device_mem =
(partial_dm_step_two + partial_dm_step_one * level_count) *
input_lwe_ciphertext_count * (glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_one) {
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
level_count * (glwe_dimension + 1);
}
// Otherwise, both kernels run entirely in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double2),
stream);
global_accumulator = (Torus *)cuda_malloc_async(
(glwe_dimension + 1) * input_lwe_ciphertext_count *
polynomial_size * sizeof(Torus),
stream);
} break;
case PBS_VARIANT::FAST: {
uint64_t full_sm =
get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// Otherwise, the kernel runs entirely in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2),
stream);
} break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
}
}
void release(cuda_stream_t *stream) {
cuda_drop_async(d_mem, stream);
cuda_drop_async(global_accumulator_fft, stream);
if (pbs_variant == DEFAULT)
cuda_drop_async(global_accumulator, stream);
}
};
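
Worked example: a minimal host-side sketch, assuming `Torus = uint64_t` (so `sizeof(Torus) == 8` and `sizeof(double2) == 16`), `polynomial_size = 2048` and a 48 KB shared-memory budget. It reproduces the DEFAULT-variant branch selection above; the driver values are illustrative assumptions, not figures from this diff.

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors get_buffer_size_full_sm_bootstrap_low_latency_step_one/_two and
// get_buffer_size_partial_sm_bootstrap_low_latency for Torus = uint64_t.
static uint64_t full_sm_step_one(uint32_t N) { return 8ull * N + 16ull * (N / 2); }
static uint64_t full_sm_step_two(uint32_t N) { return 8ull * N + 16ull * (N / 2); }
static uint64_t partial_sm(uint32_t N) { return 16ull * (N / 2); }

int main() {
  const uint32_t N = 2048, level_count = 2, glwe_dimension = 1, inputs = 4;
  const uint64_t max_shared_memory = 48 * 1024; // assumed per-block SM budget

  uint64_t fs1 = full_sm_step_one(N); // 32768 bytes
  uint64_t fs2 = full_sm_step_two(N); // 32768 bytes
  uint64_t psm = partial_sm(N);       // 16384 bytes

  uint64_t device_mem = 0;
  if (max_shared_memory < psm)
    device_mem = fs1 * inputs * level_count * (glwe_dimension + 1);
  else if (max_shared_memory < fs2)
    device_mem = ((fs2 - psm) + (fs1 - psm) * level_count) * inputs *
                 (glwe_dimension + 1);
  else if (max_shared_memory < fs1)
    device_mem = (fs1 - psm) * inputs * level_count * (glwe_dimension + 1);

  // With 48 KB of shared memory none of the branches fire: d_mem stays empty
  // and both kernels run entirely in shared memory.
  printf("device scratch bytes: %llu\n", (unsigned long long)device_mem);
  return 0;
}
```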
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2);
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus>
bool has_support_to_cuda_bootstrap_fast_low_latency(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
template <typename Torus>
void cuda_bootstrap_fast_low_latency_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template <typename Torus>
void cuda_bootstrap_low_latency_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
template <typename Torus, typename STorus>
void scratch_cuda_fast_bootstrap_low_latency(
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
template <typename Torus, typename STorus>
void scratch_cuda_bootstrap_low_latency(
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,

View File

@@ -1,23 +1,22 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#include "bootstrap.h"
#include <cstdint>
extern "C" {
bool has_support_to_cuda_bootstrap_fast_multi_bit(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t grouping_factor);
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t chunk_size = 0);
void scratch_cuda_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -25,8 +24,118 @@ void scratch_cuda_multi_bit_pbs_64(
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t chunk_size = 0);
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
void scratch_cuda_generic_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);
void cuda_generic_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
void cleanup_cuda_multi_bit_pbs_32(cuda_stream_t *stream, int8_t **pbs_buffer);
void cleanup_cuda_multi_bit_pbs_64(cuda_stream_t *stream, int8_t **pbs_buffer);
}
template <typename Torus, typename STorus>
void scratch_cuda_fast_multi_bit_pbs(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus>
void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0);
template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_pbs(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
template <typename Torus>
void cuda_multi_bit_pbs_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
double2 *keybundle_fft;
Torus *global_accumulator;
double2 *global_accumulator_fft;
PBS_VARIANT pbs_variant;
pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
this->pbs_variant = pbs_variant;
auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
if (allocate_gpu_memory) {
switch (pbs_variant) {
case DEFAULT:
case FAST:
keybundle_fft = (double2 *)cuda_malloc_async(
input_lwe_ciphertext_count * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2) * sizeof(double2),
stream);
global_accumulator = (Torus *)cuda_malloc_async(
input_lwe_ciphertext_count * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus),
stream);
global_accumulator_fft = (double2 *)cuda_malloc_async(
input_lwe_ciphertext_count * (glwe_dimension + 1) * level_count *
(polynomial_size / 2) * sizeof(double2),
stream);
break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
}
}
void release(cuda_stream_t *stream) {
cuda_drop_async(keybundle_fft, stream);
cuda_drop_async(global_accumulator, stream);
cuda_drop_async(global_accumulator_fft, stream);
}
};
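
The two definitions above are partial specializations of the forward declaration `template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;`, which is what lets one name carry a different member layout per PBS type. A stripped-down sketch of the pattern, with illustrative fields and no CUDA calls:

```cpp
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };

// The primary template is only declared; every usable layout is a
// specialization.
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

template <typename Torus> struct pbs_buffer<Torus, LOW_LAT> {
  Torus *global_accumulator = nullptr; // layout specific to low-latency PBS
};

template <typename Torus> struct pbs_buffer<Torus, MULTI_BIT> {
  double *keybundle_fft = nullptr; // layout specific to multi-bit PBS
};

int main() {
  pbs_buffer<unsigned long long, LOW_LAT> a;
  pbs_buffer<unsigned long long, MULTI_BIT> b;
  // pbs_buffer<unsigned long long, AMORTIZED> c; // would not compile: no
  // specialization exists, so unsupported combinations are rejected at
  // compile time rather than at run time.
  (void)a;
  (void)b;
  return 0;
}
```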
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,

View File

@@ -80,6 +80,12 @@ void cuda_drop_async(void *ptr, cuda_stream_t *stream);
int cuda_get_max_shared_memory(uint32_t gpu_index);
void cuda_synchronize_stream(cuda_stream_t *stream);
void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data);
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer);
}
template <typename Torus>

View File

@@ -274,7 +274,7 @@ template <typename Torus> struct int_radix_lut {
uint32_t num_blocks;
bool mem_reuse = false;
int8_t *pbs_buffer;
int8_t *buffer;
Torus *lut_indexes;
Torus *lwe_indexes;
@@ -299,7 +299,7 @@ template <typename Torus> struct int_radix_lut {
///////////////
execute_scratch_pbs<Torus>(
stream, &pbs_buffer, params.glwe_dimension, params.small_lwe_dimension,
stream, &buffer, params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_level, params.grouping_factor,
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
params.pbs_type, allocate_gpu_memory);
@@ -338,7 +338,7 @@ template <typename Torus> struct int_radix_lut {
// constructor to reuse memory
int_radix_lut(cuda_stream_t *stream, int_radix_params params,
uint32_t num_luts, uint32_t num_radix_blocks,
int_radix_lut<Torus> *base_lut_object) {
int_radix_lut *base_lut_object) {
this->params = params;
this->num_blocks = num_radix_blocks;
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
@@ -348,7 +348,7 @@ template <typename Torus> struct int_radix_lut {
// base lut object should have bigger or equal memory than current one
assert(num_radix_blocks <= base_lut_object->num_blocks);
// pbs
pbs_buffer = base_lut_object->pbs_buffer;
buffer = base_lut_object->buffer;
// Keyswitch
tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;
@@ -392,7 +392,41 @@ template <typename Torus> struct int_radix_lut {
cuda_drop_async(lwe_indexes, stream);
cuda_drop_async(lut, stream);
if (!mem_reuse) {
cuda_drop_async(pbs_buffer, stream);
switch (params.pbs_type) {
case MULTI_BIT:
switch (sizeof(Torus)) {
case sizeof(uint32_t):
cleanup_cuda_multi_bit_pbs_32(stream, &buffer);
break;
case sizeof(uint64_t):
cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit "
"integer "
"moduli are supported.")
}
break;
case LOW_LAT:
switch (sizeof(Torus)) {
case sizeof(uint32_t):
cleanup_cuda_bootstrap_low_latency_32(stream, &buffer);
break;
case sizeof(uint64_t):
cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit "
"integer "
"moduli are supported.")
}
break;
case AMORTIZED:
cleanup_cuda_bootstrap_amortized(stream, &buffer);
break;
default:
PANIC("Cuda error (PBS): unknown PBS type. ")
}
cuda_drop_async(tmp_lwe_before_ks, stream);
cuda_drop_async(tmp_lwe_after_ks, stream);
}
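
Side note on the dispatch in `release()`: `sizeof(Torus)` is an integral constant expression, so it can legally drive a `switch` whose case labels are themselves `sizeof` expressions. A tiny compilable sketch of that shape, with `printf` standing in for the real cleanup entry points:

```cpp
#include <cstdint>
#include <cstdio>

template <typename Torus> void cleanup(int8_t **buffer) {
  switch (sizeof(Torus)) { // constant expression, resolved per instantiation
  case sizeof(uint32_t):
    printf("32-bit cleanup path\n"); // stands in for cleanup_..._32
    break;
  case sizeof(uint64_t):
    printf("64-bit cleanup path\n"); // stands in for cleanup_..._64
    break;
  default:
    printf("unsupported modulus size\n");
  }
  *buffer = nullptr;
}

int main() {
  int8_t *buf = nullptr;
  cleanup<uint32_t>(&buf);
  cleanup<uint64_t>(&buf);
  return 0;
}
```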
@@ -850,8 +884,10 @@ template <typename Torus> struct int_are_all_block_true_buffer {
COMPARISON_TYPE op;
int_radix_params params;
int_radix_lut<Torus> *is_max_value_lut;
int_radix_lut<Torus> *is_equal_to_num_blocks_lut;
// This map stores LUTs that check the equality between some input and values
// of interest in are_all_block_true(), such as max_value (the maximum message
// value).
std::unordered_map<int, int_radix_lut<Torus> *> is_equal_to_lut_map;
Torus *tmp_block_accumulated;
@@ -869,34 +905,14 @@ template <typename Torus> struct int_are_all_block_true_buffer {
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
tmp_block_accumulated = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * max_chunks * sizeof(Torus), stream);
// LUT
// We need three LUTs:
// (x & max_value as u64) == max_value
// x != 0
// (x & max_value as u64) == blocks.len()
auto is_max_value_lut_f = [total_modulus](Torus x) -> Torus {
Torus max_value = total_modulus - 1;
return (x & max_value) == max_value;
};
is_max_value_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
is_equal_to_num_blocks_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
generate_device_accumulator<Torus>(
stream, is_max_value_lut->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
is_max_value_lut_f);
}
}
void release(cuda_stream_t *stream) {
is_max_value_lut->release(stream);
delete is_max_value_lut;
is_equal_to_num_blocks_lut->release(stream);
delete is_equal_to_num_blocks_lut;
for (auto &lut : is_equal_to_lut_map) {
lut.second->release(stream);
}
is_equal_to_lut_map.clear();
cuda_drop_async(tmp_block_accumulated, stream);
}
@@ -1153,7 +1169,7 @@ template <typename Torus> struct int_comparison_buffer {
stream);
tmp_packed_input = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus),
(params.big_lwe_dimension + 1) * 2 * num_radix_blocks * sizeof(Torus),
stream);
// Block comparisons

View File

@@ -106,23 +106,23 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
cudaSetDevice(stream->gpu_index);
constexpr int ideal_threads = 128;
int lwe_dim = lwe_dimension_out + 1;
int lwe_size = lwe_dimension_out + 1;
int lwe_lower, lwe_upper, cutoff;
if (lwe_dim % ideal_threads == 0) {
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = lwe_dim / ideal_threads;
if (lwe_size % ideal_threads == 0) {
lwe_lower = lwe_size / ideal_threads;
lwe_upper = lwe_size / ideal_threads;
cutoff = 0;
} else {
int y =
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
int y = ceil((double)lwe_size / (double)ideal_threads) * ideal_threads -
lwe_size;
cutoff = ideal_threads - y;
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
lwe_lower = lwe_size / ideal_threads;
lwe_upper = (int)ceil((double)lwe_size / (double)ideal_threads);
}
int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
int lwe_size_after = lwe_size * num_samples;
int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
int shared_mem = sizeof(Torus) * lwe_size;
cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
check_cuda_error(cudaGetLastError());
@@ -130,11 +130,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
dim3 grid(num_samples, 1, 1);
dim3 threads(ideal_threads, 1, 1);
// cudaFuncSetAttribute(keyswitch<Torus>,
// cudaFuncAttributeMaxDynamicSharedMemorySize,
// shared_mem);
keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
keyswitch<Torus><<<grid, threads, shared_mem, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
lwe_upper, cutoff);
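
The `lwe_lower`/`lwe_upper`/`cutoff` computation splits `lwe_size` coefficients over a fixed block of `ideal_threads` threads: `cutoff` threads process `lwe_upper` coefficients each, and the remaining threads process `lwe_lower`. A standalone sketch with illustrative parameters and a sanity check of the arithmetic:

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  const int ideal_threads = 128;
  const int lwe_dimension_out = 1024;
  const int lwe_size = lwe_dimension_out + 1; // 1025 coefficients

  int lwe_lower, lwe_upper, cutoff;
  if (lwe_size % ideal_threads == 0) {
    lwe_lower = lwe_upper = lwe_size / ideal_threads;
    cutoff = 0;
  } else {
    int y = (int)ceil((double)lwe_size / ideal_threads) * ideal_threads -
            lwe_size;                                         // 127
    cutoff = ideal_threads - y;                               // 1 thread
    lwe_lower = lwe_size / ideal_threads;                     // 8 each
    lwe_upper = (int)ceil((double)lwe_size / ideal_threads);  // 9 each
  }

  // Every coefficient is covered exactly once: 1*9 + 127*8 == 1025.
  assert(cutoff * lwe_upper + (ideal_threads - cutoff) * lwe_lower == lwe_size);
  printf("cutoff=%d lower=%d upper=%d\n", cutoff, lwe_lower, lwe_upper);
  return 0;
}
```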

View File

@@ -228,3 +228,15 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
}
void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }
void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data) {
check_cuda_error(
cudaStreamAddCallback(stream->stream, callback, user_data, 0));
}
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer) {
free(host_pointer);
}
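
This callback exists so a host staging buffer can be freed once every operation previously enqueued on the stream has completed, without blocking the host thread on a synchronize. A minimal usage sketch with illustrative buffer sizes; note that recent CUDA versions recommend `cudaLaunchHostFunc` for the same job, `cudaStreamAddCallback` being deprecated:

```cpp
#include <cstdint>
#include <cstdlib>
#include <cuda_runtime.h>

static void CUDART_CB free_host(cudaStream_t /*stream*/,
                                cudaError_t /*status*/, void *host_pointer) {
  free(host_pointer); // runs after all prior work on the stream has finished
}

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  const size_t bytes = 1024 * sizeof(uint64_t);
  uint64_t *h_lut = (uint64_t *)malloc(bytes); // host staging buffer
  void *d_lut = nullptr;
  cudaMalloc(&d_lut, bytes);

  // Enqueue the copy, then enqueue the free; no stream synchronization is
  // needed just to reclaim h_lut.
  cudaMemcpyAsync(d_lut, h_lut, bytes, cudaMemcpyHostToDevice, stream);
  cudaStreamAddCallback(stream, free_host, h_lut, 0);

  cudaStreamSynchronize(stream); // only to end the example cleanly
  cudaFree(d_lut);
  cudaStreamDestroy(stream);
  return 0;
}
```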

View File

@@ -99,29 +99,34 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
input_blocks += (big_lwe_dimension + 1) * chunk_length;
}
accumulator = are_all_block_true_buffer->tmp_block_accumulated;
auto is_equal_to_num_blocks_map =
&are_all_block_true_buffer->is_equal_to_lut_map;
// Selects a LUT
int_radix_lut<Torus> *lut;
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
} else if (chunk_length == max_value) {
// is_max_value LUT
lut = are_all_block_true_buffer->is_max_value_lut;
} else {
// is_equal_to_num_blocks LUT
lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
if (chunk_length != lut_num_blocks) {
if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
(*is_equal_to_num_blocks_map).end()) {
// The LUT is already computed
lut = (*is_equal_to_num_blocks_map)[chunk_length];
} else {
// LUT needs to be computed
auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
num_radix_blocks, true);
auto is_equal_to_num_blocks_lut_f = [max_value,
chunk_length](Torus x) -> Torus {
return (x & max_value) == chunk_length;
};
generate_device_accumulator<Torus>(
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, is_equal_to_num_blocks_lut_f);
stream, new_lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);
// We don't have to generate this lut again
lut_num_blocks = chunk_length;
(*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
lut = new_lut;
}
}
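
The map lookup above is plain memoization: the LUT keyed by `chunk_length` is generated on first use and served from the cache afterwards. A self-contained sketch of the same pattern, where `Lut` and its table contents are stand-ins for the real `int_radix_lut`:

```cpp
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Lut { std::vector<uint64_t> table; }; // stand-in for int_radix_lut

static Lut *get_is_equal_to_lut(std::unordered_map<int, Lut *> &cache,
                                int chunk_length, uint64_t max_value) {
  auto it = cache.find(chunk_length);
  if (it != cache.end())
    return it->second; // the LUT was already computed on an earlier call

  auto *lut = new Lut;
  lut->table.resize(max_value + 1);
  for (uint64_t x = 0; x <= max_value; ++x)
    lut->table[x] = (x & max_value) == (uint64_t)chunk_length;

  cache[chunk_length] = lut; // generate once, reuse on every later call
  return lut;
}

int main() {
  std::unordered_map<int, Lut *> cache;
  Lut *a = get_is_equal_to_lut(cache, 2, 3); // built
  Lut *b = get_is_equal_to_lut(cache, 2, 3); // served from the cache
  printf("same object: %d\n", a == b);       // prints 1
  for (auto &kv : cache)                     // mirrors release()
    delete kv.second;
  return 0;
}
```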

View File

@@ -88,10 +88,21 @@ void cleanup_cuda_full_propagation(cuda_stream_t *stream,
cuda_drop_async(mem_ptr->lut_buffer, stream);
cuda_drop_async(mem_ptr->lut_indexes, stream);
cuda_drop_async(mem_ptr->pbs_buffer, stream);
cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
switch (mem_ptr->pbs_type) {
case LOW_LAT: {
auto x = (pbs_buffer<uint64_t, LOW_LAT> *)(mem_ptr->pbs_buffer);
x->release(stream);
} break;
case MULTI_BIT: {
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(mem_ptr->pbs_buffer);
x->release(stream);
} break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
}
void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
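
Because `mem_ptr->pbs_buffer` is stored type-erased, cleanup has to recover the concrete `pbs_buffer` specialization from the runtime `pbs_type` tag before calling `release`. A compact sketch of that cast-back-and-release shape, with simplified types and no CUDA:

```cpp
#include <cstdio>

enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1 };

template <typename Torus, PBS_TYPE t> struct pbs_buffer {
  void release() { printf("released variant %d\n", (int)t); }
};

struct full_propagation_mem {
  PBS_TYPE pbs_type;
  void *pbs_buffer_ptr; // type-erased: the enum is the only record of the type
};

void cleanup(full_propagation_mem *mem_ptr) {
  switch (mem_ptr->pbs_type) {
  case LOW_LAT: {
    auto x = (pbs_buffer<unsigned long long, LOW_LAT> *)mem_ptr->pbs_buffer_ptr;
    x->release();
    delete x;
  } break;
  case MULTI_BIT: {
    auto x =
        (pbs_buffer<unsigned long long, MULTI_BIT> *)mem_ptr->pbs_buffer_ptr;
    x->release();
    delete x;
  } break;
  }
}

int main() {
  full_propagation_mem m{LOW_LAT,
                         new pbs_buffer<unsigned long long, LOW_LAT>};
  cleanup(&m);
  return 0;
}
```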

View File

@@ -120,7 +120,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes,
bsk, lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
bsk, lut->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
@@ -239,8 +239,8 @@ void generate_device_accumulator_bivariate(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
cuda_synchronize_stream(stream);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
/*
@@ -271,8 +271,8 @@ void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream);
cuda_synchronize_stream(stream);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
template <typename Torus>
@@ -461,8 +461,8 @@ void scratch_cuda_full_propagation(
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
stream);
cuda_synchronize_stream(stream);
free(h_lwe_indexes);
cuda_stream_add_callback(stream, host_free_on_stream_callback,
h_lwe_indexes);
}
// Temporary arrays

View File

@@ -402,18 +402,18 @@ __host__ void host_integer_mult_radix_kb(
polynomial_size * glwe_dimension, lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);
execute_pbs<Torus>(
stream, message_blocks_vector, lwe_indexes, luts_message->lut,
luts_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
luts_message->pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
message_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type);
execute_pbs<Torus>(stream, message_blocks_vector, lwe_indexes,
luts_message->lut, luts_message->lut_indexes,
small_lwe_vector, lwe_indexes, bsk, luts_message->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, message_count, 1, 0,
max_shared_memory, mem_ptr->params.pbs_type);
execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
luts_carry->lut, luts_carry->lut_indexes,
&small_lwe_vector[message_count * (lwe_dimension + 1)],
lwe_indexes, bsk, luts_carry->pbs_buffer, glwe_dimension,
lwe_indexes, bsk, luts_carry->buffer, glwe_dimension,
lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, carry_count, 1, 0,
@@ -457,7 +457,7 @@ __host__ void host_integer_mult_radix_kb(
cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);
host_addition(stream, radix_lwe_out, vector_result_sb, block_mul_res,
big_lwe_size, num_blocks);
big_lwe_dimension, num_blocks);
host_propagate_single_carry_low_latency<Torus>(
stream, radix_lwe_out, mem_ptr->scp_mem, bsk, ksk, num_blocks);

View File

@@ -13,7 +13,7 @@ uint64_t get_buffer_size_bootstrap_amortized_64(
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32-bit inputs, into `pbs_buffer`. It also
* the amortized PBS on 32-bit inputs, into `buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
@@ -67,7 +67,7 @@ void scratch_cuda_bootstrap_amortized_32(
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 64-bit inputs, into `pbs_buffer`. It also
* the amortized PBS on 64-bit inputs, into `buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
@@ -355,7 +355,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
/*
* This cleanup function frees the data for the amortized PBS on GPU in
* pbs_buffer for 32- or 64-bit inputs.
* buffer for 32- or 64-bit inputs.
*/
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer) {

View File

@@ -245,51 +245,10 @@ __global__ void device_bootstrap_fast_low_latency(
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2);
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_fast_low_latency(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
@@ -316,13 +275,10 @@ __host__ void scratch_bootstrap_fast_low_latency(
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer<Torus, LOW_LAT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::FAST, allocate_gpu_memory);
}
/*
@@ -333,11 +289,11 @@ template <typename Torus, class params>
__host__ void host_bootstrap_fast_low_latency(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
uint32_t max_shared_memory) {
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
// With SM each block corresponds to either the mask or body, no need to
@@ -353,15 +309,8 @@ __host__ void host_bootstrap_fast_low_latency(
uint64_t partial_dm = full_dm - partial_sm;
int8_t *d_mem = pbs_buffer;
double2 *buffer_fft =
(double2 *)d_mem +
(ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory) /
sizeof(double2) -
(glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * polynomial_size / 2);
int8_t *d_mem = buffer->d_mem;
double2 *buffer_fft = buffer->global_accumulator_fft;
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
@@ -436,12 +385,12 @@ __host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
thds, 0);
thds, partial_sm);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, thds,
0);
full_sm);
}
// Get the number of streaming multiprocessors
@@ -450,4 +399,46 @@ __host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
// Verify if the grid size for the low latency kernel satisfies the cooperative
// group constraints
template <typename Torus>
__host__ bool supports_cooperative_groups_on_lowlat_pbs(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 512:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 1024:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 2048:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 4096:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 8192:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 16384:
return verify_cuda_bootstrap_fast_low_latency_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
default:
PANIC("Cuda error (low latency PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}
#endif // LOWLAT_FAST_PBS_H
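
This helper follows the file's standard dispatch idiom: a runtime `polynomial_size` is switched into a compile-time `AmortizedDegree<N>` template argument, so each size gets a fully specialized check. A reduced sketch of the idiom; the fitness predicate here is a made-up stand-in, not the real grid-size computation:

```cpp
#include <cstdio>

template <int N> struct AmortizedDegree { static constexpr int degree = N; };

// Illustrative stand-in for verify_cuda_bootstrap_fast_low_latency_grid_size.
template <typename Params>
bool grid_size_fits(int glwe_dimension, int level_count, int num_samples) {
  long blocks = (long)level_count * (glwe_dimension + 1) * num_samples;
  return blocks * Params::degree < (1L << 24); // arbitrary budget
}

bool supports_fast_path(int polynomial_size, int glwe_dimension,
                        int level_count, int num_samples) {
  switch (polynomial_size) { // runtime value -> compile-time template argument
  case 256:
    return grid_size_fits<AmortizedDegree<256>>(glwe_dimension, level_count,
                                                num_samples);
  case 512:
    return grid_size_fits<AmortizedDegree<512>>(glwe_dimension, level_count,
                                                num_samples);
  case 1024:
    return grid_size_fits<AmortizedDegree<1024>>(glwe_dimension, level_count,
                                                 num_samples);
  default:
    return false; // the real code PANICs on unsupported sizes
  }
}

int main() {
  printf("%d\n", supports_fast_path(1024, 1, 2, 128));
  return 0;
}
```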

View File

@@ -154,11 +154,11 @@ __host__ __device__ uint64_t get_buffer_size_fast_multibit_bootstrap(
template <typename Torus, typename STorus, typename params>
__host__ void scratch_fast_multi_bit_pbs(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0) {
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
@@ -183,30 +183,25 @@ __host__ void scratch_fast_multi_bit_pbs(
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size =
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count);
uint64_t buffer_size = get_buffer_size_fast_multibit_bootstrap<Torus>(
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, lwe_chunk_size,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
if (!lwe_chunk_size)
lwe_chunk_size = get_average_lwe_chunk_size(
lwe_dimension, level_count, glwe_dimension, input_lwe_ciphertext_count);
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::FAST,
allocate_gpu_memory);
}
template <typename Torus, typename STorus, class params>
__host__ void host_fast_multi_bit_pbs(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
if (!lwe_chunk_size)
@@ -214,15 +209,9 @@ __host__ void host_fast_multi_bit_pbs(
glwe_dimension, num_samples);
//
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *buffer_fft = (double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2);
Torus *global_accumulator =
(Torus *)buffer_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
double2 *keybundle_fft = pbs_buffer->keybundle_fft;
Torus *global_accumulator = pbs_buffer->global_accumulator;
double2 *buffer_fft = pbs_buffer->global_accumulator_fft;
//
uint64_t full_sm_keybundle =
@@ -318,4 +307,46 @@ verify_cuda_bootstrap_fast_multi_bit_grid_size(int glwe_dimension,
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
// Verify if the grid size for the multi-bit kernel satisfies the cooperative
// group constraints
template <typename Torus>
__host__ bool supports_cooperative_groups_on_multibit_pbs(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<Torus,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 512:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<Torus,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 1024:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 2048:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 4096:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 8192:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 16384:
return verify_cuda_bootstrap_fast_multi_bit_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
#endif // FASTMULTIBIT_PBS_H

View File

@@ -221,27 +221,6 @@ __global__ void device_bootstrap_low_latency_step_two(
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -285,8 +264,8 @@ __host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_low_latency(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
@@ -338,13 +317,9 @@ __host__ void scratch_bootstrap_low_latency(
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer<Torus, LOW_LAT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
}
template <typename Torus, class params>
@@ -432,11 +407,11 @@ template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
uint32_t max_shared_memory) {
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
// With SM each block corresponds to either the mask or body, no need to
@@ -456,16 +431,9 @@ __host__ void host_bootstrap_low_latency(
uint64_t full_dm_step_one = full_sm_step_one;
uint64_t full_dm_step_two = full_sm_step_two;
double2 *global_accumulator_fft = (double2 *)pbs_buffer;
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * (polynomial_size / 2) /
sizeof(Torus));
int8_t *d_mem = (int8_t *)global_accumulator +
(ptrdiff_t)(sizeof(Torus) * (glwe_dimension + 1) *
input_lwe_ciphertext_count * polynomial_size /
sizeof(int8_t));
Torus *global_accumulator = pbs_buffer->global_accumulator;
double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft;
int8_t *d_mem = pbs_buffer->d_mem;
for (int i = 0; i < lwe_dimension; i++) {
execute_low_latency_step_one<Torus, params>(

View File

@@ -3,14 +3,26 @@
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
bool has_support_to_cuda_bootstrap_fast_multi_bit(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory) {
return supports_cooperative_groups_on_multibit_pbs<uint64_t>(
glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory);
}
template <typename Torus>
void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -18,207 +30,292 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<512>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<1024>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<2048>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<4096>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<8192>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<16384>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
template <typename Torus>
void cuda_multi_bit_pbs_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be <= number of bits "
"in the ciphertext representation (64)");
switch (polynomial_size) {
case 256:
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<256>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 512:
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<512>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 1024:
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<1024>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 2048:
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<2048>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 4096:
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<4096>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 8192:
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<8192>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
case 16384:
host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<16384>>(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
if (supports_cooperative_groups_on_multibit_pbs<uint64_t>(
glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory))
cuda_fast_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key),
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
else
cuda_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key),
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
}
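The _64 wrapper above is the single entry point exposed over void *: it probes the device once, then routes the whole batch either to the cooperative-groups ("fast") kernel or to the default multi-bit kernel. A minimal self-contained sketch of that dispatch shape follows; supports_fast_path, run_fast_pbs and run_default_pbs are toy stand-ins for supports_cooperative_groups_on_multibit_pbs and the two launchers, not part of the backend.
#include <cstdint>
#include <iostream>
// Toy stand-in: the real check compares the kernel's grid and shared-memory
// requirements against the device limits.
static bool supports_fast_path(uint32_t required_shared_mem,
                               uint32_t max_shared_mem) {
  return required_shared_mem <= max_shared_mem;
}
static void run_fast_pbs() { std::cout << "cooperative-groups kernel\n"; }
static void run_default_pbs() { std::cout << "default kernel\n"; }
// One runtime probe, then the whole batch goes down a single code path.
static void run_multi_bit_pbs(uint32_t required, uint32_t available) {
  if (supports_fast_path(required, available))
    run_fast_pbs();
  else
    run_default_pbs();
}
int main() {
  run_multi_bit_pbs(48u << 10, 96u << 10);  // fits in shared memory: fast path
  run_multi_bit_pbs(164u << 10, 96u << 10); // does not fit: default path
}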
template <typename Torus, typename STorus>
void scratch_cuda_fast_multi_bit_pbs(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<256>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 512:
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<512>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 1024:
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<1024>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 2048:
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<2048>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 4096:
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<4096>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 8192:
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<8192>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 16384:
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<16384>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_pbs(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<256>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 512:
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<512>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 1024:
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<1024>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 2048:
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<2048>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 4096:
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<4096>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 8192:
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<8192>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 16384:
scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<16384>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -228,136 +325,35 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
}
void scratch_cuda_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
if (supports_cooperative_groups_on_multibit_pbs<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_fast_multi_bit_pbs<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
else
scratch_cuda_multi_bit_pbs<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer) {
// Free memory
cuda_drop_async(*pbs_buffer, stream);
void cleanup_cuda_multi_bit_pbs_32(cuda_stream_t *stream, int8_t **buffer) {
auto x = (pbs_buffer<uint32_t, MULTI_BIT> *)(*buffer);
x->release(stream);
}
void cleanup_cuda_multi_bit_pbs_64(cuda_stream_t *stream, int8_t **buffer) {
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(*buffer);
x->release(stream);
}
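Both cleanup entry points recover the typed pbs_buffer from the type-erased int8_t * that crosses the C API boundary, then release it; under this refactor that replaces the old cuda_drop_async of a raw allocation. A self-contained sketch of the pattern, with toy_pbs_buffer as a simplified stand-in for the real structured buffer (which also owns device allocations tied to a stream):
#include <cstdint>
#include <iostream>
// Simplified stand-in for pbs_buffer<Torus, MULTI_BIT>: owns scratch storage
// and frees it in release().
template <typename Torus> struct toy_pbs_buffer {
  Torus *scratch = new Torus[1024];
  void release() {
    delete[] scratch;
    scratch = nullptr;
    std::cout << "scratch released\n";
  }
};
// The C API hands the buffer around as int8_t *; the typed cleanup casts it
// back before releasing, mirroring cleanup_cuda_multi_bit_pbs_64 above.
void cleanup_toy_64(int8_t **buffer) {
  auto *typed = reinterpret_cast<toy_pbs_buffer<uint64_t> *>(*buffer);
  typed->release();
  delete typed;
  *buffer = nullptr;
}
int main() {
  auto *typed = new toy_pbs_buffer<uint64_t>;
  auto *erased = reinterpret_cast<int8_t *>(typed);
  cleanup_toy_64(&erased);
}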
// Pick the best possible chunk size for each GPU
@@ -423,7 +419,12 @@ __host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
return 9;
} else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
// Tesla H100
return 45;
if (num_samples < 1024)
return 128;
else if (num_samples < 4096)
return 64;
else
return 32;
}
// Generic case
@@ -451,11 +452,11 @@ __host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
return (ct_count > 10000) ? 30 : 45;
} else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
// Tesla H100
return (ct_count > 10000) ? 30 : 45;
return 64;
}
// Generic case
return (ct_count > 10000) ? 2 : 10;
return (ct_count > 10000) ? 2 : 1;
}
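These two hunks retune the H100 heuristics: get_lwe_chunk_size now derives the chunk size from the batch size instead of returning a fixed 45, and get_average_lwe_chunk_size pins H100 to 64. A stand-alone restatement of the new batch-dependent branch, convenient for checking the thresholds in isolation (the surrounding per-GPU name matching is elided):
#include <cassert>
#include <cstdint>
// Batch-size-dependent LWE chunk size for H100, as introduced above.
uint32_t h100_lwe_chunk_size(uint32_t num_samples) {
  if (num_samples < 1024)
    return 128;
  else if (num_samples < 4096)
    return 64;
  else
    return 32;
}
int main() {
  assert(h100_lwe_chunk_size(512) == 128);
  assert(h100_lwe_chunk_size(1024) == 64);
  assert(h100_lwe_chunk_size(4096) == 32);
}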
// Returns the maximum buffer size required to execute batches up to
@@ -469,14 +470,51 @@ __host__ uint64_t get_max_buffer_size_multibit_bootstrap(
for (uint32_t input_lwe_ciphertext_count = 1;
input_lwe_ciphertext_count <= max_input_lwe_ciphertext_count;
input_lwe_ciphertext_count *= 2) {
max_buffer_size = std::max(
max_buffer_size,
get_buffer_size_multibit_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count)));
max_buffer_size =
std::max(max_buffer_size,
get_buffer_size_multibit_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
get_average_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension,
input_lwe_ciphertext_count)));
}
return max_buffer_size;
}
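The sizing loop above now asks get_average_lwe_chunk_size rather than get_lwe_chunk_size for each candidate batch, and keeps the worst case over all power-of-two batch sizes so one allocation covers any batch up to the maximum. A toy version of that sweep; toy_buffer_size is a placeholder cost model, not the real get_buffer_size_multibit_bootstrap:
#include <algorithm>
#include <cstdint>
#include <iostream>
// Placeholder cost model: scratch grows with the ciphertext count.
uint64_t toy_buffer_size(uint32_t ct_count) { return 4096ull * ct_count; }
// Sweep batch sizes 1, 2, 4, ... up to the maximum and keep the largest
// requirement, so a single allocation covers every smaller batch.
uint64_t max_toy_buffer_size(uint32_t max_ct_count) {
  uint64_t max_size = 0;
  for (uint32_t n = 1; n <= max_ct_count; n *= 2)
    max_size = std::max<uint64_t>(max_size, toy_buffer_size(n));
  return max_size;
}
int main() { std::cout << max_toy_buffer_size(4096) << "\n"; }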
template void scratch_cuda_multi_bit_pbs<uint64_t, int64_t>(
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template void cuda_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
template void scratch_cuda_fast_multi_bit_pbs<uint64_t, int64_t>(
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
template void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);


@@ -329,13 +329,12 @@ __host__ __device__ uint64_t get_buffer_size_multibit_bootstrap(
}
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
__host__ void scratch_multi_bit_pbs(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
@@ -374,29 +373,25 @@ scratch_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size =
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count);
uint64_t buffer_size = get_buffer_size_multibit_bootstrap<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
if (!lwe_chunk_size)
lwe_chunk_size = get_average_lwe_chunk_size(
lwe_dimension, level_count, glwe_dimension, input_lwe_ciphertext_count);
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
allocate_gpu_memory);
}
template <typename Torus, typename STorus, class params>
__host__ void host_multi_bit_pbs(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
// If a chunk size is not passed to this function, select one.
@@ -404,15 +399,9 @@ __host__ void host_multi_bit_pbs(
lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension, num_samples);
//
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *global_accumulator_fft =
(double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
double2 *keybundle_fft = buffer->keybundle_fft;
Torus *global_accumulator = buffer->global_accumulator;
double2 *global_accumulator_fft = buffer->global_accumulator_fft;
//
uint64_t full_sm_keybundle =

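This hunk is the core of the refactor: the removed lines carved keybundle_fft, global_accumulator_fft and global_accumulator out of a flat int8_t * with manual offset arithmetic at every call site, while the new lines read named members off the structured pbs_buffer. A toy contrast under illustrative sizes (the real offsets also depend on num_samples, level_count, glwe_dimension and polynomial_size):
#include <cstddef>
#include <vector>
// After the refactor: the buffer computes its layout once and exposes named
// members, so kernel launchers stop repeating offset arithmetic.
struct toy_buffer {
  std::vector<double> storage;
  double *keybundle_fft;
  double *global_accumulator_fft;
  toy_buffer(size_t keybundle_len, size_t accumulator_len)
      : storage(keybundle_len + accumulator_len),
        keybundle_fft(storage.data()),
        global_accumulator_fft(storage.data() + keybundle_len) {}
};
int main() {
  toy_buffer buf(1 << 10, 1 << 8);
  // Before the refactor every launcher recomputed this offset by hand:
  double *by_hand = buf.storage.data() + (1 << 10);
  return by_hand == buf.global_accumulator_fft ? 0 : 1;
}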

@@ -100,11 +100,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream);
double2 *buffer;
double2 *buffer = (double2 *)cuda_malloc_async(0, stream);
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -123,7 +122,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -142,7 +140,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -161,7 +158,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -180,7 +176,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -199,7 +194,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -218,7 +212,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -236,7 +229,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
}
break;
default:
break;
PANIC("Cuda error (convert KSK): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..16384].")
}
cuda_drop_async(d_bsk, stream);


@@ -0,0 +1,12 @@
option(TFHE_CUDA_BACKEND_BUILD_TESTS "Build the test tool" OFF)
option(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS "Build the benchmark tool" OFF)
if(TFHE_CUDA_BACKEND_BUILD_TESTS)
message(STATUS "Building the test tool")
add_subdirectory(tests)
endif()
if(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS)
message(STATUS "Building the benchmark tool")
add_subdirectory(benchmarks)
endif()
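Both options default to OFF, so a plain configure builds only the backend itself; passing -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON or -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON on the cmake command line pulls in the corresponding subdirectory.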


@@ -0,0 +1,88 @@
project(benchmark_tfhe_cuda_backend LANGUAGES CXX)
# See if the minimum CUDA version is available. If not, fail the configuration.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable CUDA to check the version.
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
endif()
# If CUDA is not available, or the minimum version is too low, do not build
if(NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "Cuda compiler not found.")
endif()
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
# Disable the Google Benchmark requirement on Google Test
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
set(BENCHMARK_ENABLE_TESTING OFF)
include(FetchContent)
FetchContent_Declare(
googlebenchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.7.1)
FetchContent_MakeAvailable(googlebenchmark)
# Enable ExternalProject CMake module
include(ExternalProject)
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/../../../../")
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
if(NOT TARGET tfhe-rs)
ExternalProject_Add(
tfhe-rs
SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
BUILD_IN_SOURCE 1
BUILD_ALWAYS 1
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
DOWNLOAD_COMMAND ""
BUILD_COMMAND make build_c_api
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/src)
include_directories(${TFHE_RS_BINARY_DIR})
include_directories(${TFHE_RS_BINARY_DIR}/deps)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
find_package(OpenMP REQUIRED)
# Add the OpenMP flag to the compiler flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
add_library(tfhe_rs_lib STATIC IMPORTED)
add_dependencies(tfhe_rs_lib tfhe-rs)
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")
set(BINARY benchmark_tfhe_cuda_backend)
file(
GLOB_RECURSE BENCH_SOURCES
LIST_DIRECTORIES false
benchmark*.cpp main.cpp)
add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
set_target_properties(benchmark_tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS
ON)
target_link_libraries(
benchmark_tfhe_cuda_backend
PUBLIC benchmark::benchmark tfhe_rs_lib tfhe_cuda_backend OpenMP::OpenMP_CXX
PRIVATE CUDA::cudart)


@@ -0,0 +1,73 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
size_t polynomial_size;
int samples;
} FourierTransformTestParams;
class FourierTransformTestPrimitives_u64 : public benchmark::Fixture {
protected:
size_t polynomial_size;
int num_samples;
cuda_stream_t *stream;
int gpu_index = 0;
double *poly1;
double *poly2; // will be used as extracted result for cuda mult
double2 *h_cpoly1;
double2 *h_cpoly2; // will be used as a result poly
double2 *d_cpoly1;
double2 *d_cpoly2; // will be used as a result poly
public:
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// get test params
polynomial_size = state.range(0);
num_samples = state.range(1);
fft_setup(stream, &poly1, &poly2, &h_cpoly1, &h_cpoly2, &d_cpoly1,
&d_cpoly2, polynomial_size, num_samples);
}
void TearDown(const ::benchmark::State &state) {
fft_teardown(stream, poly1, poly2, h_cpoly1, h_cpoly2, d_cpoly1, d_cpoly2);
}
};
BENCHMARK_DEFINE_F(FourierTransformTestPrimitives_u64, cuda_fft_mult)
(benchmark::State &st) {
for (auto _ : st) {
cuda_fourier_polynomial_mul(d_cpoly1, d_cpoly2, d_cpoly2, stream,
polynomial_size, num_samples);
cuda_synchronize_stream(stream);
}
}
static void FFTBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// polynomial_size, samples
std::vector<FourierTransformTestParams> params = {
(FourierTransformTestParams){256, 100},
(FourierTransformTestParams){512, 100},
(FourierTransformTestParams){1024, 100},
(FourierTransformTestParams){2048, 100},
(FourierTransformTestParams){4096, 100},
(FourierTransformTestParams){8192, 100},
(FourierTransformTestParams){16384, 100},
};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.polynomial_size, x.samples});
}
BENCHMARK_REGISTER_F(FourierTransformTestPrimitives_u64, cuda_fft_mult)
->Apply(FFTBenchmarkGenerateParams)
->ArgNames({"polynomial_size", "samples"});
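Because registration goes through Google Benchmark, the resulting binary accepts the library's standard flags: for example, --benchmark_filter=cuda_fft_mult restricts a run to this FFT benchmark, and --benchmark_format=json emits machine-readable results.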


@@ -0,0 +1,372 @@
#include <benchmark/benchmark.h>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#include <setup_and_teardown.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int input_lwe_ciphertext_count;
int grouping_factor;
int chunk_size;
} MultiBitPBSBenchmarkParams;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int input_lwe_ciphertext_count;
} BootstrapBenchmarkParams;
class MultiBitBootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int input_lwe_ciphertext_count;
int grouping_factor;
DynamicDistribution lwe_modular_variance;
DynamicDistribution glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
uint64_t delta;
cuda_stream_t *stream;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
uint64_t *d_bsk;
uint64_t *d_lut_pbs_identity;
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
uint64_t *d_lwe_input_indexes;
uint64_t *d_lwe_output_indexes;
int8_t *buffer;
int chunk_size;
public:
void SetUp(const ::benchmark::State &state) {
int gpu_index = 0;
stream = cuda_create_stream(gpu_index);
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
input_lwe_ciphertext_count = state.range(5);
grouping_factor = state.range(6);
chunk_size = state.range(7);
// Noise distributions used for key generation and encryption
lwe_modular_variance =
new_gaussian_from_std_dev(sqrt(0.000007069849454709433));
glwe_modular_variance =
new_gaussian_from_std_dev(sqrt(0.00000000000000029403601535432533));
Seed seed;
init_seed(&seed);
bootstrap_multibit_setup(
stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk, &plaintexts,
&d_lut_pbs_identity, &d_lut_pbs_indexes, &d_lwe_ct_in_array,
&d_lwe_input_indexes, &d_lwe_ct_out_array, &d_lwe_output_indexes,
&buffer, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, lwe_modular_variance, glwe_modular_variance,
pbs_base_log, pbs_level, message_modulus, carry_modulus,
&payload_modulus, &delta, input_lwe_ciphertext_count, 1, 1);
}
void TearDown(const ::benchmark::State &state) {
bootstrap_multibit_teardown(
stream, lwe_sk_in_array, lwe_sk_out_array, d_bsk, plaintexts,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
cudaDeviceReset();
}
};
class ClassicalBootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int input_lwe_ciphertext_count;
DynamicDistribution lwe_modular_variance;
DynamicDistribution glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
uint64_t delta;
double *d_fourier_bsk;
uint64_t *d_lut_pbs_identity;
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_input_indexes;
uint64_t *d_lwe_output_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_array;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
int8_t *buffer;
cuda_stream_t *stream;
public:
void SetUp(const ::benchmark::State &state) {
int gpu_index = 0;
stream = cuda_create_stream(gpu_index);
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
input_lwe_ciphertext_count = state.range(5);
// Noise distributions used for key generation and encryption
lwe_modular_variance =
new_gaussian_from_std_dev(sqrt(0.000007069849454709433));
glwe_modular_variance =
new_gaussian_from_std_dev(sqrt(0.00000000000000029403601535432533));
Seed seed;
init_seed(&seed);
bootstrap_classical_setup(
stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_fourier_bsk,
&plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes,
&d_lwe_ct_in_array, &d_lwe_input_indexes, &d_lwe_ct_out_array,
&d_lwe_output_indexes, lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level,
message_modulus, carry_modulus, &payload_modulus, &delta,
input_lwe_ciphertext_count, 1, 1);
}
void TearDown(const ::benchmark::State &state) {
bootstrap_classical_teardown(
stream, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk, plaintexts,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
cudaDeviceReset();
}
};
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, FastMultiBit)
(benchmark::State &st) {
if (!has_support_to_cuda_bootstrap_fast_multi_bit(
glwe_dimension, polynomial_size, pbs_level,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(stream->gpu_index))) {
st.SkipWithError("Configuration not supported for fast operation");
return;
}
scratch_cuda_fast_multi_bit_pbs<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
true, chunk_size);
for (auto _ : st) {
// Execute PBS
cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
cuda_synchronize_stream(stream);
}
cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
}
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
(benchmark::State &st) {
scratch_cuda_multi_bit_pbs<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
true, chunk_size);
for (auto _ : st) {
// Execute PBS
cuda_multi_bit_pbs_lwe_ciphertext_vector(
stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
cuda_synchronize_stream(stream);
}
cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
}
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, FastLowLatencyPBS)
(benchmark::State &st) {
if (!has_support_to_cuda_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, pbs_level,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(stream->gpu_index))) {
st.SkipWithError("Configuration not supported for fast operation");
return;
}
scratch_cuda_fast_bootstrap_low_latency<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, LOW_LAT> **)&buffer, glwe_dimension,
polynomial_size, pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(stream->gpu_index), true);
for (auto _ : st) {
// Execute PBS
cuda_bootstrap_fast_low_latency_lwe_ciphertext_vector<uint64_t>(
stream, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
(pbs_buffer<uint64_t, LOW_LAT> *)buffer, lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count, 1,
0, cuda_get_max_shared_memory(stream->gpu_index));
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
}
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultLowLatencyPBS)
(benchmark::State &st) {
scratch_cuda_bootstrap_low_latency<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, LOW_LAT> **)&buffer, glwe_dimension,
polynomial_size, pbs_level, input_lwe_ciphertext_count,
cuda_get_max_shared_memory(stream->gpu_index), true);
for (auto _ : st) {
// Execute PBS
cuda_bootstrap_low_latency_lwe_ciphertext_vector<uint64_t>(
stream, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
(uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
(pbs_buffer<uint64_t, LOW_LAT> *)buffer, lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count, 1,
0, cuda_get_max_shared_memory(stream->gpu_index));
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
}
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS)
(benchmark::State &st) {
scratch_cuda_bootstrap_amortized_64(
stream, &buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
true);
for (auto _ : st) {
// Execute PBS
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in_array, (void *)d_lwe_input_indexes,
(void *)d_fourier_bsk, buffer, lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count, 1,
0, cuda_get_max_shared_memory(stream->gpu_index));
cuda_synchronize_stream(stream);
}
cleanup_cuda_bootstrap_amortized(stream, &buffer);
}
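Note that in all four benchmarks the scratch allocation happens in the benchmark body, before the timed loop, and the cleanup after it, so each measured iteration covers only the kernel launch plus the stream synchronization, not buffer management.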
static void
MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
// input_lwe_ciphertext_count, grouping_factor, chunk_size
std::vector<MultiBitPBSBenchmarkParams> params = {
// 4_bits_multi_bit_group_2
(MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2, 0},
// 4_bits_multi_bit_group_3
(MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3, 0},
};
// Add to the list of parameters to benchmark
for (auto x : params) {
for (int input_lwe_ciphertext_count = 1; input_lwe_ciphertext_count <= 4096;
input_lwe_ciphertext_count *= 2) {
for (int lwe_chunk_size = 1;
lwe_chunk_size <= x.lwe_dimension / x.grouping_factor;
lwe_chunk_size *= 2)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
x.grouping_factor, lwe_chunk_size});
int lwe_chunk_size = x.lwe_dimension / x.grouping_factor;
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
x.grouping_factor, lwe_chunk_size});
}
}
}
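Concretely, for the group-2 entry above (lwe_dimension 818, grouping_factor 2) the inner loop sweeps lwe_chunk_size through 1, 2, 4, ..., 256 and then appends the exact cap 818 / 2 = 409, and this whole sweep repeats for every power-of-two input_lwe_ciphertext_count from 1 to 4096.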
static void
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
// input_lwe_ciphertext_count
// PARAM_MESSAGE_2_CARRY_2_KS_PBS
std::vector<BootstrapBenchmarkParams> params = {
(BootstrapBenchmarkParams){742, 1, 2048, 23, 1, 1},
};
// Add to the list of parameters to benchmark
for (int num_samples = 1; num_samples <= 4096; num_samples *= 2)
for (auto x : params) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, num_samples});
}
}
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, FastMultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
"grouping_factor", "chunk_size"});
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, DefaultMultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
"grouping_factor", "chunk_size"});
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, DefaultLowLatencyPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, AmortizedPBS)
->Apply(BootstrapBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});


@@ -0,0 +1,3 @@
#include <benchmark/benchmark.h>
BENCHMARK_MAIN();


@@ -0,0 +1,71 @@
#ifndef SETUP_AND_TEARDOWN_H
#define SETUP_AND_TEARDOWN_H
#include <bootstrap.h>
#include <bootstrap_multibit.h>
#include <device.h>
#include <keyswitch.h>
#include <utils.h>
void bootstrap_classical_setup(
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **plaintexts, uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_input_indexes, uint64_t **d_lwe_ct_out_array,
uint64_t **d_lwe_output_indexes, int lwe_dimension, int glwe_dimension,
int polynomial_size, DynamicDistribution lwe_noise_distribution,
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples);
void bootstrap_classical_teardown(
cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
uint64_t *plaintexts, uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_input_indexes, uint64_t *d_lwe_ct_out_array,
uint64_t *d_lwe_output_indexes);
void bootstrap_multibit_setup(
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, uint64_t **d_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity, uint64_t **d_lut_pbs_indexes,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
int8_t **pbs_buffer, int lwe_dimension, int glwe_dimension,
int polynomial_size, int grouping_factor,
DynamicDistribution lwe_noise_distribution,
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples,
int chunk_size = 0);
void bootstrap_multibit_teardown(
cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, uint64_t *d_bsk_array, uint64_t *plaintexts,
uint64_t *d_lut_pbs_identity, uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_input_indexes,
uint64_t *d_lwe_ct_out_array, uint64_t *d_lwe_output_indexes);
void keyswitch_setup(
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, uint64_t **d_ksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
int input_lwe_dimension, int output_lwe_dimension,
DynamicDistribution lwe_noise_distribution, int ksk_base_log, int ksk_level,
int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples);
void keyswitch_teardown(cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, uint64_t *d_ksk_array,
uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
uint64_t *lwe_input_indexes,
uint64_t *d_lwe_ct_out_array,
uint64_t *lwe_output_indexes);
void fft_setup(cuda_stream_t *stream, double **poly1, double **poly2,
double2 **h_cpoly1, double2 **h_cpoly2, double2 **d_cpoly1,
double2 **d_cpoly2, size_t polynomial_size, int samples);
void fft_teardown(cuda_stream_t *stream, double *poly1, double *poly2,
double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1,
double2 *d_cpoly2);
#endif // SETUP_AND_TEARDOWN_H


@@ -0,0 +1,54 @@
#ifndef UTILS_H
#define UTILS_H
#include <device.h>
#include <functional>
#include <tfhe.h>
typedef struct Seed {
uint64_t lo;
uint64_t hi;
} Seed;
void init_seed(Seed *seed);
void shuffle_seed(Seed *seed);
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
int number_of_inputs, const unsigned repetitions,
const unsigned samples);
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
int message_modulus, int carry_modulus,
std::function<uint64_t(uint64_t)> func);
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
Seed *seed, const unsigned repetitions);
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
int polynomial_size, Seed *seed,
const unsigned repetitions);
void generate_lwe_bootstrap_keys(cuda_stream_t *stream,
double **d_fourier_bsk_array,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size,
int pbs_level, int pbs_base_log, Seed *seed,
DynamicDistribution noise_distribution,
const unsigned repetitions);
void generate_lwe_multi_bit_pbs_keys(
cuda_stream_t *stream, uint64_t **d_bsk_array, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int lwe_dimension, int glwe_dimension,
int polynomial_size, int pbs_level, int pbs_base_log, int grouping_factor,
Seed *seed, DynamicDistribution noise_distribution,
const unsigned repetitions);
void generate_lwe_keyswitch_keys(
cuda_stream_t *stream, uint64_t **d_ksk_array, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int input_lwe_dimension,
int output_lwe_dimension, int ksk_level, int ksk_base_log, Seed *seed,
DynamicDistribution noise_distribution, const unsigned repetitions);
#endif


@@ -0,0 +1,438 @@
#include <cmath>
#include <random>
#include <setup_and_teardown.h>
void bootstrap_classical_setup(
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **plaintexts, uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_input_indexes, uint64_t **d_lwe_ct_out_array,
uint64_t **d_lwe_output_indexes, int lwe_dimension, int glwe_dimension,
int polynomial_size, DynamicDistribution lwe_noise_distribution,
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples) {
*payload_modulus = message_modulus * carry_modulus;
// Value of the shift we multiply our messages by
*delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);
// Generate the keys
shuffle_seed(seed);
generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, seed, repetitions);
shuffle_seed(seed);
generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
seed, repetitions);
shuffle_seed(seed);
generate_lwe_bootstrap_keys(stream, d_fourier_bsk_array, *lwe_sk_in_array,
*lwe_sk_out_array, lwe_dimension, glwe_dimension,
polynomial_size, pbs_level, pbs_base_log, seed,
glwe_noise_distribution, repetitions);
shuffle_seed(seed);
*plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
repetitions, samples);
// Create the LUT
uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
polynomial_size, glwe_dimension, message_modulus, carry_modulus,
[](int x) -> int { return x; });
uint64_t *lwe_ct_in_array =
(uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs * repetitions *
samples * sizeof(uint64_t));
// Create the input/output ciphertexts
for (int r = 0; r < repetitions; r++) {
uint64_t *lwe_sk_in = *lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
for (int s = 0; s < samples; s++) {
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = (*plaintexts)[r * samples * number_of_inputs +
s * number_of_inputs + i];
uint64_t *lwe_ct_in =
lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
s * number_of_inputs + i) *
(lwe_dimension + 1));
core_crypto_lwe_encrypt(lwe_ct_in, plaintext, lwe_sk_in, lwe_dimension,
lwe_noise_distribution, seed->lo, seed->hi);
shuffle_seed(seed);
}
}
}
// Initialize and copy things in/to the device
*d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream);
cuda_memcpy_async_to_gpu(
*d_lut_pbs_identity, lut_pbs_identity,
polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t), stream);
*d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(
number_of_inputs * sizeof(uint64_t), stream);
cuda_memset_async(*d_lut_pbs_indexes, 0, number_of_inputs * sizeof(uint64_t),
stream);
// Input and output LWEs
*d_lwe_ct_out_array =
(uint64_t *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream);
*d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
(lwe_dimension + 1) * number_of_inputs * repetitions * samples *
sizeof(uint64_t),
stream);
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
repetitions * samples * number_of_inputs *
(lwe_dimension + 1) * sizeof(uint64_t),
stream);
uint64_t *h_lwe_indexes =
(uint64_t *)malloc(number_of_inputs * sizeof(uint64_t));
*d_lwe_input_indexes = (uint64_t *)cuda_malloc_async(
number_of_inputs * sizeof(uint64_t), stream);
*d_lwe_output_indexes = (uint64_t *)cuda_malloc_async(
number_of_inputs * sizeof(uint64_t), stream);
for (int i = 0; i < number_of_inputs; i++)
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(*d_lwe_input_indexes, h_lwe_indexes,
number_of_inputs * sizeof(uint64_t), stream);
cuda_memcpy_async_to_gpu(*d_lwe_output_indexes, h_lwe_indexes,
number_of_inputs * sizeof(uint64_t), stream);
stream->synchronize();
free(lwe_ct_in_array);
free(lut_pbs_identity);
free(h_lwe_indexes);
}
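With the moduli these fixtures use (message_modulus = 4, carry_modulus = 4), the payload modulus is 16 and delta = 2^63 / 16 = 2^59: each plaintext is shifted into the top bits of the 64-bit torus, leaving the low bits as headroom for noise.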
void bootstrap_classical_teardown(
cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
uint64_t *plaintexts, uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_input_indexes, uint64_t *d_lwe_ct_out_array,
uint64_t *d_lwe_output_indexes) {
cuda_synchronize_stream(stream);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
cuda_drop_async(d_fourier_bsk_array, stream);
cuda_drop_async(d_lut_pbs_identity, stream);
cuda_drop_async(d_lut_pbs_indexes, stream);
cuda_drop_async(d_lwe_ct_in_array, stream);
cuda_drop_async(d_lwe_ct_out_array, stream);
cuda_drop_async(d_lwe_input_indexes, stream);
cuda_drop_async(d_lwe_output_indexes, stream);
stream->synchronize();
stream->release();
}
void bootstrap_multibit_setup(
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, uint64_t **d_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity, uint64_t **d_lut_pbs_indexes,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
int8_t **pbs_buffer, int lwe_dimension, int glwe_dimension,
int polynomial_size, int grouping_factor,
DynamicDistribution lwe_noise_distribution,
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples,
int lwe_chunk_size) {
cudaSetDevice(stream->gpu_index);
*payload_modulus = message_modulus * carry_modulus;
// Value of the shift we multiply our messages by
*delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);
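// e.g. message_modulus = 4 and carry_modulus = 4 give payload_modulus = 16
// and delta = 2^59, so messages occupy the top bits while the noise grows in
// the lower bits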
// Generate the keys
shuffle_seed(seed);
generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, seed, repetitions);
shuffle_seed(seed);
generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
seed, repetitions);
shuffle_seed(seed);
generate_lwe_multi_bit_pbs_keys(
stream, d_bsk_array, *lwe_sk_in_array, *lwe_sk_out_array, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_level, pbs_base_log,
seed, glwe_noise_distribution, repetitions);
shuffle_seed(seed);
*plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
repetitions, samples);
// Create the LUT
uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
polynomial_size, glwe_dimension, message_modulus, carry_modulus,
[](int x) -> int { return x; });
uint64_t *lwe_ct_in_array =
(uint64_t *)malloc((lwe_dimension + 1) * number_of_inputs * repetitions *
samples * sizeof(uint64_t));
// Create the input/output ciphertexts
for (int r = 0; r < repetitions; r++) {
uint64_t *lwe_sk_in = *lwe_sk_in_array + (ptrdiff_t)(r * lwe_dimension);
for (int s = 0; s < samples; s++) {
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = (*plaintexts)[r * samples * number_of_inputs +
s * number_of_inputs + i];
uint64_t *lwe_ct_in =
lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
s * number_of_inputs + i) *
(lwe_dimension + 1));
core_crypto_lwe_encrypt(lwe_ct_in, plaintext, lwe_sk_in, lwe_dimension,
lwe_noise_distribution, seed->lo, seed->hi);
shuffle_seed(seed);
}
}
}
// Initialize and copy things in/to the device
*d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint64_t), stream);
cuda_memcpy_async_to_gpu(
*d_lut_pbs_identity, lut_pbs_identity,
polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t), stream);
*d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(
number_of_inputs * sizeof(uint64_t), stream);
cuda_memset_async(*d_lut_pbs_indexes, 0, number_of_inputs * sizeof(uint64_t),
stream);
// Input and output LWEs
*d_lwe_ct_out_array =
(uint64_t *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream);
*d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
(lwe_dimension + 1) * number_of_inputs * repetitions * samples *
sizeof(uint64_t),
stream);
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
repetitions * samples * number_of_inputs *
(lwe_dimension + 1) * sizeof(uint64_t),
stream);
uint64_t *h_lwe_indexes =
(uint64_t *)malloc(number_of_inputs * sizeof(uint64_t));
*d_lwe_input_indexes = (uint64_t *)cuda_malloc_async(
number_of_inputs * sizeof(uint64_t), stream);
*d_lwe_output_indexes = (uint64_t *)cuda_malloc_async(
number_of_inputs * sizeof(uint64_t), stream);
for (int i = 0; i < number_of_inputs; i++)
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(*d_lwe_input_indexes, h_lwe_indexes,
number_of_inputs * sizeof(uint64_t), stream);
cuda_memcpy_async_to_gpu(*d_lwe_output_indexes, h_lwe_indexes,
number_of_inputs * sizeof(uint64_t), stream);
scratch_cuda_multi_bit_pbs_64(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_level, grouping_factor, number_of_inputs,
cuda_get_max_shared_memory(stream->gpu_index), true, lwe_chunk_size);
stream->synchronize();
free(h_lwe_indexes);
free(lut_pbs_identity);
free(lwe_ct_in_array);
}
void bootstrap_multibit_teardown(
cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, uint64_t *d_bsk_array, uint64_t *plaintexts,
uint64_t *d_lut_pbs_identity, uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_input_indexes,
uint64_t *d_lwe_ct_out_array, uint64_t *d_lwe_output_indexes) {
cuda_synchronize_stream(stream);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
cuda_drop_async(d_bsk_array, stream);
cuda_drop_async(d_lut_pbs_identity, stream);
cuda_drop_async(d_lut_pbs_indexes, stream);
cuda_drop_async(d_lwe_ct_in_array, stream);
cuda_drop_async(d_lwe_ct_out_array, stream);
cuda_drop_async(d_lwe_input_indexes, stream);
cuda_drop_async(d_lwe_output_indexes, stream);
stream->synchronize();
stream->release();
}
void keyswitch_setup(
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, uint64_t **d_ksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
int input_lwe_dimension, int output_lwe_dimension,
DynamicDistribution lwe_noise_distribution, int ksk_base_log, int ksk_level,
int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples) {
*payload_modulus = message_modulus * carry_modulus;
// Value of the shift we multiply our messages by
*delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);
// Generate the keys
shuffle_seed(seed);
generate_lwe_secret_keys(lwe_sk_in_array, input_lwe_dimension, seed,
repetitions);
shuffle_seed(seed);
generate_lwe_secret_keys(lwe_sk_out_array, output_lwe_dimension, seed,
repetitions);
shuffle_seed(seed);
generate_lwe_keyswitch_keys(stream, d_ksk_array, *lwe_sk_in_array,
*lwe_sk_out_array, input_lwe_dimension,
output_lwe_dimension, ksk_level, ksk_base_log,
seed, lwe_noise_distribution, repetitions);
shuffle_seed(seed);
*plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
repetitions, samples);
*d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(
(output_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t), stream);
*d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * number_of_inputs * repetitions * samples *
sizeof(uint64_t),
stream);
uint64_t *lwe_ct_in_array =
(uint64_t *)malloc((input_lwe_dimension + 1) * number_of_inputs *
repetitions * samples * sizeof(uint64_t));
// Create the input/output ciphertexts
for (int r = 0; r < repetitions; r++) {
uint64_t *lwe_sk_in =
*lwe_sk_in_array + (ptrdiff_t)(r * input_lwe_dimension);
for (int s = 0; s < samples; s++) {
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = (*plaintexts)[r * samples * number_of_inputs +
s * number_of_inputs + i];
uint64_t *lwe_ct_in =
lwe_ct_in_array + (ptrdiff_t)((r * samples * number_of_inputs +
s * number_of_inputs + i) *
(input_lwe_dimension + 1));
core_crypto_lwe_encrypt(lwe_ct_in, plaintext, lwe_sk_in,
input_lwe_dimension, lwe_noise_distribution,
seed->lo, seed->hi);
shuffle_seed(seed);
}
}
}
cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array,
repetitions * samples * number_of_inputs *
(input_lwe_dimension + 1) * sizeof(uint64_t),
stream);
stream->synchronize();
uint64_t *h_lwe_indexes =
(uint64_t *)malloc(number_of_inputs * sizeof(uint64_t));
*d_lwe_input_indexes = (uint64_t *)cuda_malloc_async(
number_of_inputs * sizeof(uint64_t), stream);
*d_lwe_output_indexes = (uint64_t *)cuda_malloc_async(
number_of_inputs * sizeof(uint64_t), stream);
for (int i = 0; i < number_of_inputs; i++)
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(*d_lwe_input_indexes, h_lwe_indexes,
number_of_inputs * sizeof(uint64_t), stream);
cuda_memcpy_async_to_gpu(*d_lwe_output_indexes, h_lwe_indexes,
number_of_inputs * sizeof(uint64_t), stream);
cuda_synchronize_stream(stream);
free(h_lwe_indexes);
free(lwe_ct_in_array);
}
void keyswitch_teardown(cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, uint64_t *d_ksk_array,
uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_input_indexes,
uint64_t *d_lwe_ct_out_array,
uint64_t *d_lwe_output_indexes) {
cuda_synchronize_stream(stream);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
cuda_drop_async(d_ksk_array, stream);
cuda_drop_async(d_lwe_ct_in_array, stream);
cuda_drop_async(d_lwe_ct_out_array, stream);
cuda_drop_async(d_lwe_input_indexes, stream);
cuda_drop_async(d_lwe_output_indexes, stream);
stream->synchronize();
stream->release();
}
void fft_setup(cuda_stream_t *stream, double **_poly1, double **_poly2,
double2 **_h_cpoly1, double2 **_h_cpoly2, double2 **_d_cpoly1,
double2 **_d_cpoly2, size_t polynomial_size, int samples) {
auto &poly1 = *_poly1;
auto &poly2 = *_poly2;
auto &h_cpoly1 = *_h_cpoly1;
auto &h_cpoly2 = *_h_cpoly2;
auto &d_cpoly1 = *_d_cpoly1;
auto &d_cpoly2 = *_d_cpoly2;
poly1 = (double *)malloc(polynomial_size * samples * sizeof(double));
poly2 = (double *)malloc(polynomial_size * samples * sizeof(double));
h_cpoly1 = (double2 *)malloc(polynomial_size / 2 * samples * sizeof(double2));
h_cpoly2 = (double2 *)malloc(polynomial_size / 2 * samples * sizeof(double2));
d_cpoly1 = (double2 *)cuda_malloc_async(
polynomial_size / 2 * samples * sizeof(double2), stream);
d_cpoly2 = (double2 *)cuda_malloc_async(
polynomial_size / 2 * samples * sizeof(double2), stream);
double lower_bound = -1;
double upper_bound = 1;
std::uniform_real_distribution<double> unif(lower_bound, upper_bound);
std::default_random_engine re;
// Fill test data with random values
for (size_t i = 0; i < polynomial_size * samples; i++) {
poly1[i] = unif(re);
poly2[i] = unif(re);
}
// prepare data for device
// compress the real polynomials into complex form
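// Each real polynomial of size N is packed into N/2 complex values:
// coefficient i becomes the real part and coefficient i + N/2 the imaginary
// part of the i-th complex entry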
for (size_t p = 0; p < (size_t)samples; p++) {
auto left_cpoly = &h_cpoly1[p * polynomial_size / 2];
auto right_cpoly = &h_cpoly2[p * polynomial_size / 2];
auto left = &poly1[p * polynomial_size];
auto right = &poly2[p * polynomial_size];
for (std::size_t i = 0; i < polynomial_size / 2; ++i) {
left_cpoly[i].x = left[i];
left_cpoly[i].y = left[i + polynomial_size / 2];
right_cpoly[i].x = right[i];
right_cpoly[i].y = right[i + polynomial_size / 2];
}
}
// copy memory cpu->gpu
cuda_memcpy_async_to_gpu(d_cpoly1, h_cpoly1,
polynomial_size / 2 * samples * sizeof(double2),
stream);
cuda_memcpy_async_to_gpu(d_cpoly2, h_cpoly2,
polynomial_size / 2 * samples * sizeof(double2),
stream);
stream->synchronize();
}
void fft_teardown(cuda_stream_t *stream, double *poly1, double *poly2,
double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1,
double2 *d_cpoly2) {
stream->synchronize();
free(poly1);
free(poly2);
free(h_cpoly1);
free(h_cpoly2);
cuda_drop_async(d_cpoly1, stream);
cuda_drop_async(d_cpoly2, stream);
stream->synchronize();
stream->release();
}


@@ -0,0 +1,81 @@
project(test_tfhe_cuda_backend LANGUAGES CXX)
# Check that CUDA and the minimum supported version are available; otherwise configuration fails.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable CUDA to check the version.
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
endif()
# If CUDA is not available, fail the configuration with an explicit error
if(NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "CUDA compiler not found.")
endif()
include(FetchContent)
FetchContent_Declare(googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt
ON
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
# Enable ExternalProject CMake module
include(ExternalProject)
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/../../../../")
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
if(NOT TARGET tfhe-rs)
ExternalProject_Add(
tfhe-rs
SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
BUILD_IN_SOURCE 1
BUILD_ALWAYS 1
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
DOWNLOAD_COMMAND ""
BUILD_COMMAND make build_c_api
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
include_directories(${TFHE_RS_BINARY_DIR})
include_directories(${TFHE_RS_BINARY_DIR}/deps)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
add_library(tfhe_rs_lib STATIC IMPORTED)
add_dependencies(tfhe_rs_lib tfhe-rs)
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")
set(BINARY test_tfhe_cuda_backend)
file(
GLOB_RECURSE TEST_SOURCES
LIST_DIRECTORIES false
test_*.cpp)
add_executable(${BINARY} ${TEST_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
add_test(NAME ${BINARY} COMMAND ${BINARY})
set_target_properties(
${BINARY}
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES native)
target_link_libraries(${BINARY} PUBLIC GTest::gtest_main tfhe_rs_lib tfhe_cuda_backend cudart)
include(GoogleTest)
gtest_discover_tests(${BINARY})


@@ -0,0 +1,61 @@
# test_tfhe_cuda_backend
This test tool is built on the GoogleTest library. It checks the correctness of the basic
cryptographic primitives accelerated with CUDA and helps identify arithmetic flaws.
The output format can be adjusted through GoogleTest's standard command-line options.
For each test case, a particular function is executed and its result is checked against the expected behavior. This is repeated for multiple encryption keys and for several samples per key; both counts can be changed through the `REPETITIONS` and `SAMPLES` variables at the beginning of each test file, as shown below.
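The keyswitch test in this repository, for instance, uses the following values (each test file sets its own):

```cpp
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 50;
```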
## How to Compile
The first step in compiling code with CMake is to create a build directory. This directory will
contain all the files generated during the build process, such as object files and executables.
We recommend creating this directory next to the sources, inside the
implementation folder, to keep the source directories clean.
```bash
$ cd tfhe-rs/backends/tfhe-cuda-backend/cuda
$ mkdir build
$ cd build
```
Run CMake to generate the build files and then use make to compile the project.
```bash
$ cmake ..
$ make
```
The binary will be found in
`tfhe-rs/backends/tfhe-cuda-backend/cuda/build/tests/src`.
## How to Run Tests
To run tests, you can simply execute the `test_tfhe_cuda_backend` executable with no arguments:
```bash
$ tests/src/test_tfhe_cuda_backend
```
This will run all the available tests.
## How to Filter Tests
You can select a subset of tests by passing a filter on the test names as
an argument. Only tests whose full name matches the filter will be executed.
For example, to run only tests whose name starts with the word "Bootstrap", you can execute:
```bash
$ tests/src/test_tfhe_cuda_backend --gtest_filter=Bootstrap*
```
The parameter `--gtest_list_tests` can be used to list all the available tests, and a more
detailed description of how to select a subset of tests can be found in the
[GoogleTest documentation](http://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests).
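For example, to list all registered tests without running them:

```bash
$ tests/src/test_tfhe_cuda_backend --gtest_list_tests
```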
## Conclusion
With these options, you can easily verify the correctness of concrete-cuda's implementations. If
you have any questions or issues, please feel free to contact us.
To learn more about GoogleTest library, please refer to the [official user guide](http://google.github.io/googletest/).


@@ -0,0 +1,387 @@
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
DynamicDistribution lwe_noise_distribution;
DynamicDistribution glwe_noise_distribution;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int repetitions;
int samples;
} ClassicalBootstrapTestParams;
class ClassicalBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<ClassicalBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
DynamicDistribution lwe_noise_distribution;
DynamicDistribution glwe_noise_distribution;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int payload_modulus;
int number_of_inputs;
int repetitions;
int samples;
uint64_t delta;
cuda_stream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_lut_pbs_identity;
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lwe_input_indexes;
uint64_t *d_lwe_output_indexes;
uint64_t *lwe_ct_out_array;
public:
// Set up and tear down the test state
void SetUp() {
stream = cuda_create_stream(gpu_index);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_noise_distribution =
(DynamicDistribution)GetParam().lwe_noise_distribution;
glwe_noise_distribution =
(DynamicDistribution)GetParam().glwe_noise_distribution;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
repetitions = (int)GetParam().repetitions;
samples = (int)GetParam().samples;
Seed seed;
init_seed(&seed);
bootstrap_classical_setup(
stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_input_indexes,
&d_lwe_ct_out_array, &d_lwe_output_indexes, lwe_dimension,
glwe_dimension, polynomial_size, lwe_noise_distribution,
glwe_noise_distribution, pbs_base_log, pbs_level, message_modulus,
carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
samples);
lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t));
}
void TearDown() {
free(lwe_ct_out_array);
bootstrap_classical_teardown(
stream, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array,
plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
}
};
TEST_P(ClassicalBootstrapTestPrimitives_u64, amortized_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_amortized_64(
stream, &pbs_buffer, glwe_dimension, polynomial_size, number_of_inputs,
cuda_get_max_shared_memory(gpu_index), true);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
// Here execute the PBS
for (int r = 0; r < repetitions; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_lwe_input_indexes,
(void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level, number_of_inputs, 1, 0,
cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
EXPECT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
// plaintext
// - decrypted;
// error_sample_vec.push(err);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
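// Adding the rounding bit rounds the noisy plaintext to the nearest
// multiple of delta before decoding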
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta)
<< "Repetition: " << r << ", sample: " << s;
}
}
}
cleanup_cuda_bootstrap_amortized(stream, &pbs_buffer);
}
TEST_P(ClassicalBootstrapTestPrimitives_u64, low_latency_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_low_latency_64(
stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
// Here execute the PBS
for (int r = 0; r < repetitions; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_lwe_input_indexes,
(void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level, number_of_inputs, 1, 0,
cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
EXPECT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext :
// plaintext
// - decrypted;
// error_sample_vec.push(err);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta);
}
}
}
cleanup_cuda_bootstrap_low_latency_64(stream, &pbs_buffer);
}
// Defines the parameter sets for which the PBS is tested.
// Each test below is executed once per parameter set.
::testing::internal::ParamGenerator<ClassicalBootstrapTestParams>
pbs_params_u64 = ::testing::Values(
// n, k, N, lwe_noise_distribution, glwe_noise_distribution, pbs_base_log,
// pbs_level, message_modulus, carry_modulus, number_of_inputs, repetitions,
// samples
// BOOLEAN_DEFAULT_PARAMETERS
(ClassicalBootstrapTestParams){
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 2, 2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
(ClassicalBootstrapTestParams){
830, 2, 1024,
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalBootstrapTestParams){
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
1, 2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalBootstrapTestParams){
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 2, 2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalBootstrapTestParams){
656, 2, 512,
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalBootstrapTestParams){
742, 2, 1024,
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalBootstrapTestParams){
745, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalBootstrapTestParams){
807, 1, 4096,
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
2, 1, 40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalBootstrapTestParams){
915, 1, 8192,
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
2, 1, 5},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalBootstrapTestParams){
864, 1, 8192,
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
2, 1, 5},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalBootstrapTestParams){
930, 1, 16384,
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
1, 2, 1, 5},
// BOOLEAN_DEFAULT_PARAMETERS
(ClassicalBootstrapTestParams){
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 100, 2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
(ClassicalBootstrapTestParams){
830, 2, 1024,
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalBootstrapTestParams){
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
1, 100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalBootstrapTestParams){
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 100, 2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalBootstrapTestParams){
656, 2, 512,
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalBootstrapTestParams){
742, 2, 1024,
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalBootstrapTestParams){
745, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalBootstrapTestParams){
807, 1, 4096,
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
100, 1, 40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalBootstrapTestParams){
915, 1, 8192,
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
100, 1, 5},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalBootstrapTestParams){
864, 1, 8192,
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
100, 1, 5},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalBootstrapTestParams){
930, 1, 16384,
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
1, 100, 1, 5});
std::string
printParamName(::testing::TestParamInfo<ClassicalBootstrapTestParams> p) {
ClassicalBootstrapTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_number_of_inputs_" +
std::to_string(params.number_of_inputs);
}
INSTANTIATE_TEST_CASE_P(ClassicalBootstrapInstantiation,
ClassicalBootstrapTestPrimitives_u64, pbs_params_u64,
printParamName);


@@ -0,0 +1,127 @@
#include "utils.h"
#include "gtest/gtest.h"
#include <bootstrap.h>
#include <cstdint>
#include <device.h>
#include <functional>
#include <random>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
size_t polynomial_size;
int samples;
} FourierTransformTestParams;
class FourierTransformTestPrimitives_u64
: public ::testing::TestWithParam<FourierTransformTestParams> {
protected:
size_t polynomial_size;
int samples;
cuda_stream_t *stream;
int gpu_index = 0;
double *poly1;
double *poly2; // will be used as extracted result for cuda mult
double *poly_exp_result;
double2 *h_cpoly1;
double2 *h_cpoly2; // will be used as a result poly
double2 *d_cpoly1;
double2 *d_cpoly2; // will be used as a result poly
public:
void SetUp() {
stream = cuda_create_stream(gpu_index);
// get test params
polynomial_size = (int)GetParam().polynomial_size;
samples = (int)GetParam().samples;
fft_setup(stream, &poly1, &poly2, &h_cpoly1, &h_cpoly2, &d_cpoly1,
&d_cpoly2, polynomial_size, samples);
// allocate memory
poly_exp_result =
(double *)malloc(polynomial_size * 2 * samples * sizeof(double));
memset(poly_exp_result, 0, polynomial_size * 2 * samples * sizeof(double));
// execute school book multiplication
for (size_t p = 0; p < (size_t)samples; p++) {
auto left = &poly1[p * polynomial_size];
auto right = &poly2[p * polynomial_size];
auto res = &poly_exp_result[p * polynomial_size * 2];
// multiplication
for (std::size_t i = 0; i < polynomial_size; ++i) {
for (std::size_t j = 0; j < polynomial_size; ++j) {
res[i + j] += left[i] * right[j];
}
}
// make result negacyclic
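// in Z[X]/(X^N + 1), X^N = -1, so coefficient i + N folds back onto
// coefficient i with a sign flip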
for (size_t i = 0; i < polynomial_size; i++) {
res[i] = res[i] - res[i + polynomial_size];
}
}
}
void TearDown() {
fft_teardown(stream, poly1, poly2, h_cpoly1, h_cpoly2, d_cpoly1, d_cpoly2);
free(poly_exp_result);
}
};
TEST_P(FourierTransformTestPrimitives_u64, cuda_fft_mult) {
int r = 0;
auto cur_input1 = &d_cpoly1[r * polynomial_size / 2 * samples];
auto cur_input2 = &d_cpoly2[r * polynomial_size / 2 * samples];
auto cur_h_c_res = &h_cpoly2[r * polynomial_size / 2 * samples];
auto cur_poly2 = &poly2[r * polynomial_size * samples];
auto cur_expected = &poly_exp_result[r * polynomial_size * 2 * samples];
cuda_fourier_polynomial_mul(cur_input1, cur_input2, cur_input2, stream,
polynomial_size, samples);
cuda_memcpy_async_to_cpu(cur_h_c_res, cur_input2,
polynomial_size / 2 * samples * sizeof(double2),
stream);
cuda_synchronize_stream(stream);
for (int p = 0; p < samples; p++) {
for (size_t i = 0; i < (size_t)polynomial_size / 2; i++) {
cur_poly2[p * polynomial_size + i] =
cur_h_c_res[p * polynomial_size / 2 + i].x;
cur_poly2[p * polynomial_size + i + polynomial_size / 2] =
cur_h_c_res[p * polynomial_size / 2 + i].y;
}
}
for (size_t p = 0; p < (size_t)samples; p++) {
for (size_t i = 0; i < (size_t)polynomial_size; i++) {
EXPECT_NEAR(cur_poly2[p * polynomial_size + i],
cur_expected[p * 2 * polynomial_size + i], 1e-9);
}
}
}
::testing::internal::ParamGenerator<FourierTransformTestParams> fft_params_u64 =
::testing::Values((FourierTransformTestParams){256, 100},
(FourierTransformTestParams){512, 100},
(FourierTransformTestParams){1024, 100},
(FourierTransformTestParams){2048, 100},
(FourierTransformTestParams){4096, 100},
(FourierTransformTestParams){8192, 50},
(FourierTransformTestParams){16384, 10});
std::string
printParamName(::testing::TestParamInfo<FourierTransformTestParams> p) {
FourierTransformTestParams params = p.param;
return "N_" + std::to_string(params.polynomial_size) + "_samples_" +
std::to_string(params.samples);
}
INSTANTIATE_TEST_CASE_P(fftInstantiation, FourierTransformTestPrimitives_u64,
fft_params_u64, printParamName);


@@ -0,0 +1,162 @@
#include <cmath>
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 50;
typedef struct {
int input_lwe_dimension;
int output_lwe_dimension;
DynamicDistribution noise_distribution;
int ksk_base_log;
int ksk_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
} KeyswitchTestParams;
class KeyswitchTestPrimitives_u64
: public ::testing::TestWithParam<KeyswitchTestParams> {
protected:
int input_lwe_dimension;
int output_lwe_dimension;
DynamicDistribution noise_distribution;
int ksk_base_log;
int ksk_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int payload_modulus;
uint64_t delta;
cuda_stream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
uint64_t *d_ksk_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *lwe_in_ct;
uint64_t *lwe_out_ct;
uint64_t *lwe_input_indexes;
uint64_t *lwe_output_indexes;
public:
// Set up and tear down the test state
void SetUp() {
stream = cuda_create_stream(gpu_index);
// TestParams
input_lwe_dimension = (int)GetParam().input_lwe_dimension;
output_lwe_dimension = (int)GetParam().output_lwe_dimension;
noise_distribution = (DynamicDistribution)GetParam().noise_distribution;
ksk_base_log = (int)GetParam().ksk_base_log;
ksk_level = (int)GetParam().ksk_level;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
Seed seed;
init_seed(&seed);
keyswitch_setup(stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array,
&d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
&lwe_input_indexes, &d_lwe_ct_out_array,
&lwe_output_indexes, input_lwe_dimension,
output_lwe_dimension, noise_distribution, ksk_base_log,
ksk_level, message_modulus, carry_modulus, &payload_modulus,
&delta, number_of_inputs, REPETITIONS, SAMPLES);
}
void TearDown() {
keyswitch_teardown(stream, lwe_sk_in_array, lwe_sk_out_array, d_ksk_array,
plaintexts, d_lwe_ct_in_array, lwe_input_indexes,
d_lwe_ct_out_array, lwe_output_indexes);
}
};
TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
uint64_t *lwe_out_ct = (uint64_t *)malloc(
(output_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t));
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_out_sk =
lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(input_lwe_dimension + 1));
// Execute keyswitch
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, (void *)d_lwe_ct_out_array, (void *)lwe_output_indexes,
(void *)d_lwe_ct_in, (void *)lwe_input_indexes, (void *)d_ksk,
input_lwe_dimension, output_lwe_dimension, ksk_base_log, ksk_level,
number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_ct_out_array,
number_of_inputs * (output_lwe_dimension + 1) *
sizeof(uint64_t),
stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted,
lwe_out_ct + i * (output_lwe_dimension + 1),
lwe_out_sk, output_lwe_dimension);
EXPECT_NE(decrypted, plaintext);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta);
}
}
}
free(lwe_out_ct);
}
// Defines the parameter sets for which the keyswitch is tested.
// Each test below is executed once per parameter set.
::testing::internal::ParamGenerator<KeyswitchTestParams> ksk_params_u64 =
::testing::Values(
// n, k*N, noise_distribution, ks_base_log, ks_level,
// message_modulus, carry_modulus, number_of_inputs
(KeyswitchTestParams){
567, 1280, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
3, 3, 2, 1, 10},
(KeyswitchTestParams){
694, 1536, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
4, 3, 2, 1, 10},
(KeyswitchTestParams){
769, 2048, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
4, 3, 2, 1, 10},
(KeyswitchTestParams){
754, 2048, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
3, 5, 2, 1, 10},
(KeyswitchTestParams){742, 2048,
new_gaussian_from_std_dev(sqrt(4.9982771e-11)), 3,
5, 4, 1, 10},
(KeyswitchTestParams){
847, 4096, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
4, 4, 2, 1, 10});
std::string printParamName(::testing::TestParamInfo<KeyswitchTestParams> p) {
KeyswitchTestParams params = p.param;
return "na_" + std::to_string(params.input_lwe_dimension) + "_nb_" +
std::to_string(params.output_lwe_dimension) + "_baselog_" +
std::to_string(params.ksk_base_log) + "_ksk_level_" +
std::to_string(params.ksk_level);
}
INSTANTIATE_TEST_CASE_P(KeyswitchInstantiation, KeyswitchTestPrimitives_u64,
ksk_params_u64, printParamName);


@@ -0,0 +1,215 @@
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
DynamicDistribution lwe_noise_distribution;
DynamicDistribution glwe_noise_distribution;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int grouping_factor;
int repetitions;
int samples;
} MultiBitBootstrapTestParams;
class MultiBitBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<MultiBitBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
DynamicDistribution lwe_noise_distribution;
DynamicDistribution glwe_noise_distribution;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int payload_modulus;
int number_of_inputs;
int grouping_factor;
uint64_t delta;
cuda_stream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
uint64_t *d_bsk_array;
uint64_t *d_lut_pbs_identity;
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
uint64_t *d_lwe_input_indexes;
uint64_t *d_lwe_output_indexes;
int8_t *pbs_buffer;
int repetitions;
int samples;
public:
void SetUp() {
stream = cuda_create_stream(gpu_index);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
grouping_factor = (int)GetParam().grouping_factor;
lwe_noise_distribution =
(DynamicDistribution)GetParam().lwe_noise_distribution;
glwe_noise_distribution =
(DynamicDistribution)GetParam().glwe_noise_distribution;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
Seed seed;
init_seed(&seed);
repetitions = (int)GetParam().repetitions;
samples = (int)GetParam().samples;
bootstrap_multibit_setup(
stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk_array,
&plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes,
&d_lwe_ct_in_array, &d_lwe_input_indexes, &d_lwe_ct_out_array,
&d_lwe_output_indexes, &pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, lwe_noise_distribution,
glwe_noise_distribution, pbs_base_log, pbs_level, message_modulus,
carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
samples);
lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t));
}
void TearDown() {
free(lwe_ct_out_array);
cleanup_cuda_multi_bit_pbs_64(stream, &pbs_buffer);
bootstrap_multibit_teardown(
stream, lwe_sk_in_array, lwe_sk_out_array, d_bsk_array, plaintexts,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
}
};
TEST_P(MultiBitBootstrapTestPrimitives_u64, multi_bit_pbs) {
int bsk_size = (lwe_dimension / grouping_factor) * pbs_level *
(glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size *
(1 << grouping_factor);
for (int r = 0; r < repetitions; r++) {
uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_bsk,
pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, pbs_base_log, pbs_level, number_of_inputs, 1, 0,
cuda_get_max_shared_memory(gpu_index));
// Copy result to the host memory
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
glwe_dimension * polynomial_size);
EXPECT_NE(decrypted, plaintext)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
}
}
}
}
// Defines the parameter sets for which the multi-bit PBS is tested.
// Each test below is executed once per parameter set.
::testing::internal::ParamGenerator<MultiBitBootstrapTestParams>
multipbs_params_u64 = ::testing::Values(
// fast test parameters
(MultiBitBootstrapTestParams){
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
2, 1, 2, 1, 10},
(MultiBitBootstrapTestParams){
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
2, 128, 2, 1, 10},
// 4_bits_multi_bit_group_2
(MultiBitBootstrapTestParams){
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 22, 1, 2,
2, 1, 2, 1, 10},
(MultiBitBootstrapTestParams){
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-15)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-24)), 22, 1, 2,
2, 128, 2, 1, 10},
// 4_bits_multi_bit_group_3
(MultiBitBootstrapTestParams){
888, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
1, 3, 1, 10},
(MultiBitBootstrapTestParams){
888, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
128, 3, 1, 10});
std::string
printParamName(::testing::TestParamInfo<MultiBitBootstrapTestParams> p) {
MultiBitBootstrapTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_grouping_factor_" +
std::to_string(params.grouping_factor) + "_number_of_inputs_" +
std::to_string(params.number_of_inputs);
}
INSTANTIATE_TEST_CASE_P(MultiBitBootstrapInstantiation,
MultiBitBootstrapTestPrimitives_u64,
multipbs_params_u64, printParamName);


@@ -0,0 +1,249 @@
#include <algorithm>
#include <bootstrap.h>
#include <bootstrap_multibit.h>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <device.h>
#include <functional>
#include <random>
#include <utils.h>
void init_seed(Seed *seed) {
seed->lo = 0;
seed->hi = 0;
}
void shuffle_seed(Seed *seed) {
// std::random_device rd;
// std::mt19937 gen(rd());
// std::uniform_int_distribution<unsigned long long> dis(
// std::numeric_limits<std::uint64_t>::min(),
// std::numeric_limits<std::uint64_t>::max());
//
// seed.lo += dis(gen);
// seed.hi += dis(gen);
// Incrementing deterministically is more convenient for reproducible tests
seed->lo += 1;
seed->hi += 1;
}
// For each sample and repetition, create a plaintext
// The payload_modulus is the message modulus times the carry modulus
// (so the total message modulus)
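// The array is laid out as [repetition][sample][input], flattened into one block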
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
int number_of_inputs, const unsigned repetitions,
const unsigned samples) {
uint64_t *plaintext_array = (uint64_t *)malloc(
repetitions * samples * number_of_inputs * sizeof(uint64_t));
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned long long> dis(
std::numeric_limits<std::uint64_t>::min(),
std::numeric_limits<std::uint64_t>::max());
for (uint r = 0; r < repetitions; r++) {
for (uint s = 0; s < samples; s++) {
for (int i = 0; i < number_of_inputs; i++) {
plaintext_array[r * samples * number_of_inputs + s * number_of_inputs +
i] = (dis(gen) % payload_modulus) * delta;
}
}
}
return plaintext_array;
}
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
int message_modulus, int carry_modulus,
std::function<uint64_t(uint64_t)> func) {
// Total modulus of the message: message bits plus carry bits
uint64_t modulus_sup = message_modulus * carry_modulus;
// N / modulus_sup = size of the box encoding each message value
uint64_t box_size = polynomial_size / modulus_sup;
// Value of the shift we multiply our messages by
uint64_t delta = ((uint64_t)1 << 63) / (uint64_t)(modulus_sup);
// Create the plaintext lut_pbs
uint64_t *plaintext_lut_pbs =
(uint64_t *)malloc(polynomial_size * sizeof(uint64_t));
// Fill each box of the LUT with func(i), scaled by delta
for (uint64_t i = 0; i < modulus_sup; i++) {
uint64_t index = i * box_size;
for (uint64_t j = index; j < index + box_size; j++) {
plaintext_lut_pbs[j] = func(i) * delta;
}
}
uint64_t half_box_size = box_size / 2;
// Negate the first half_box_size coefficients
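// (they wrap around during the rotation below and pick up a minus sign in
// the negacyclic ring)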
for (uint64_t i = 0; i < half_box_size; i++) {
plaintext_lut_pbs[i] = -plaintext_lut_pbs[i];
}
// Rotate the plaintext_lut_pbs
std::rotate(plaintext_lut_pbs, plaintext_lut_pbs + half_box_size,
plaintext_lut_pbs + polynomial_size);
// Create the GLWE lut_pbs
uint64_t *lut_pbs = (uint64_t *)malloc(
polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t));
for (int i = 0; i < polynomial_size * glwe_dimension; i++) {
lut_pbs[i] = 0;
}
for (int i = 0; i < polynomial_size; i++) {
int glwe_index = glwe_dimension * polynomial_size + i;
lut_pbs[glwe_index] = plaintext_lut_pbs[i];
}
free(plaintext_lut_pbs);
return lut_pbs;
}
// Generate repetitions LWE secret keys
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
Seed *seed, const unsigned repetitions) {
*lwe_sk_array =
(uint64_t *)malloc(lwe_dimension * repetitions * sizeof(uint64_t));
int shift = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the lwe secret key for each repetition
core_crypto_lwe_secret_key(*lwe_sk_array + (ptrdiff_t)(shift),
lwe_dimension, seed->lo, seed->hi);
shift += lwe_dimension;
}
}
// Generate repetitions GLWE secret keys
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
int polynomial_size, Seed *seed,
const unsigned repetitions) {
int glwe_sk_array_size = glwe_dimension * polynomial_size * repetitions;
*glwe_sk_array = (uint64_t *)malloc(glwe_sk_array_size * sizeof(uint64_t));
int shift = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the lwe secret key for each repetition
core_crypto_lwe_secret_key(*glwe_sk_array + (ptrdiff_t)(shift),
glwe_dimension * polynomial_size, seed->lo,
seed->hi);
shift += glwe_dimension * polynomial_size;
}
}
// Generate repetitions LWE bootstrap keys
void generate_lwe_bootstrap_keys(cuda_stream_t *stream,
double **d_fourier_bsk_array,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size,
int pbs_level, int pbs_base_log, Seed *seed,
DynamicDistribution noise_distribution,
const unsigned repetitions) {
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
int bsk_array_size = bsk_size * repetitions;
uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
*d_fourier_bsk_array =
(double *)cuda_malloc_async(bsk_array_size * sizeof(double), stream);
int shift_in = 0;
int shift_out = 0;
int shift_bsk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the bootstrap key for each repetition
core_crypto_par_generate_lwe_bootstrapping_key(
bsk_array + (ptrdiff_t)(shift_bsk), pbs_base_log, pbs_level,
lwe_sk_in_array + (ptrdiff_t)(shift_in), lwe_dimension,
lwe_sk_out_array + (ptrdiff_t)(shift_out), glwe_dimension,
polynomial_size, noise_distribution, seed->lo, seed->hi);
double *d_fourier_bsk = *d_fourier_bsk_array + (ptrdiff_t)(shift_bsk);
uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
cuda_synchronize_stream(stream);
cuda_convert_lwe_bootstrap_key_64((void *)(d_fourier_bsk), (void *)(bsk),
stream, lwe_dimension, glwe_dimension,
pbs_level, polynomial_size);
shift_in += lwe_dimension;
shift_out += glwe_dimension * polynomial_size;
shift_bsk += bsk_size;
}
cuda_synchronize_stream(stream);
free(bsk_array);
}
void generate_lwe_multi_bit_pbs_keys(
cuda_stream_t *stream, uint64_t **d_bsk_array, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int lwe_dimension, int glwe_dimension,
int polynomial_size, int grouping_factor, int pbs_level, int pbs_base_log,
Seed *seed, DynamicDistribution noise_distribution,
const unsigned repetitions) {
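// Each of the lwe_dimension / grouping_factor groups of key bits yields
// 2^grouping_factor GGSW-like ciphertexts of
// pbs_level * (glwe_dimension + 1)^2 * polynomial_size torus elements,
// which gives the size computed below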
int bsk_size = lwe_dimension * pbs_level * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size *
(1 << grouping_factor) / grouping_factor;
int bsk_array_size = bsk_size * repetitions;
uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
*d_bsk_array =
(uint64_t *)cuda_malloc_async(bsk_array_size * sizeof(uint64_t), stream);
int shift_in = 0;
int shift_out = 0;
int shift_bsk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the multi-bit bootstrap key for each repetition
core_crypto_par_generate_lwe_multi_bit_bootstrapping_key(
lwe_sk_in_array + (ptrdiff_t)(shift_in), lwe_dimension,
lwe_sk_out_array + (ptrdiff_t)(shift_out), glwe_dimension,
polynomial_size, bsk_array + (ptrdiff_t)(shift_bsk), pbs_base_log,
pbs_level, grouping_factor, noise_distribution, 0, 0);
uint64_t *d_bsk = *d_bsk_array + (ptrdiff_t)(shift_bsk);
uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
cuda_convert_lwe_multi_bit_bootstrap_key_64(
d_bsk, bsk, stream, lwe_dimension, glwe_dimension, pbs_level,
polynomial_size, grouping_factor);
shift_in += lwe_dimension;
shift_out += glwe_dimension * polynomial_size;
shift_bsk += bsk_size;
}
cuda_synchronize_stream(stream);
free(bsk_array);
}
// Generate repetitions keyswitch keys
void generate_lwe_keyswitch_keys(
cuda_stream_t *stream, uint64_t **d_ksk_array, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int input_lwe_dimension,
int output_lwe_dimension, int ksk_level, int ksk_base_log, Seed *seed,
DynamicDistribution noise_distribution, const unsigned repetitions) {
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
int ksk_array_size = ksk_size * repetitions;
uint64_t *ksk_array = (uint64_t *)malloc(ksk_array_size * sizeof(uint64_t));
*d_ksk_array =
(uint64_t *)cuda_malloc_async(ksk_array_size * sizeof(uint64_t), stream);
int shift_in = 0;
int shift_out = 0;
int shift_ksk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the keyswitch key for each repetition
core_crypto_par_generate_lwe_keyswitch_key(
ksk_array + (ptrdiff_t)(shift_ksk), ksk_base_log, ksk_level,
lwe_sk_in_array + (ptrdiff_t)(shift_in), input_lwe_dimension,
lwe_sk_out_array + (ptrdiff_t)(shift_out), output_lwe_dimension,
noise_distribution, seed->lo, seed->hi);
uint64_t *d_ksk = *d_ksk_array + (ptrdiff_t)(shift_ksk);
uint64_t *ksk = ksk_array + (ptrdiff_t)(shift_ksk);
cuda_memcpy_async_to_gpu(d_ksk, ksk, ksk_size * sizeof(uint64_t), stream);
shift_in += input_lwe_dimension;
shift_out += output_lwe_dimension;
shift_ksk += ksk_size;
}
cuda_synchronize_stream(stream);
free(ksk_array);
}


@@ -215,7 +215,7 @@ extern "C" {
/// This cleanup function frees the data for the low latency PBS on GPU
/// contained in pbs_buffer for 32 or 64-bit inputs.
pub fn cleanup_cuda_bootstrap_low_latency(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
pub fn cleanup_cuda_bootstrap_low_latency_64(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
/// This scratch function allocates the necessary amount of data on the GPU for
/// the multi-bit PBS on 64-bit inputs into `pbs_buffer`.
@@ -297,7 +297,7 @@ extern "C" {
/// This cleanup function frees the data for the multi-bit PBS on GPU
/// contained in pbs_buffer for 64-bit inputs.
pub fn cleanup_cuda_multi_bit_pbs(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
pub fn cleanup_cuda_multi_bit_pbs_64(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
/// Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
///


@@ -4,6 +4,7 @@ benchmark_parser
Parse criterion benchmark or keys size results.
"""
import argparse
import csv
import pathlib
@@ -11,45 +12,97 @@ import json
import sys
ONE_HOUR_IN_NANOSECONDS = 3600E9
ONE_HOUR_IN_NANOSECONDS = 3600e9
parser = argparse.ArgumentParser()
parser.add_argument('results',
help='Location of criterion benchmark results directory.'
'If the --key-size option is used, then the value would have to point to'
'a CSV file.')
parser.add_argument('output_file', help='File storing parsed results')
parser.add_argument('-d', '--database', dest='database',
help='Name of the database used to store results')
parser.add_argument('-w', '--hardware', dest='hardware',
help='Hardware reference used to perform benchmark')
parser.add_argument('-V', '--project-version', dest='project_version',
help='Commit hash reference')
parser.add_argument('-b', '--branch', dest='branch',
help='Git branch name on which benchmark was performed')
parser.add_argument('--commit-date', dest='commit_date',
help='Timestamp of commit hash used in project_version')
parser.add_argument('--bench-date', dest='bench_date',
help='Timestamp when benchmark was run')
parser.add_argument('--name-suffix', dest='name_suffix', default='',
help='Suffix to append to each of the result test names')
parser.add_argument('--append-results', dest='append_results', action='store_true',
help='Append parsed results to an existing file')
parser.add_argument('--walk-subdirs', dest='walk_subdirs', action='store_true',
help='Check for results in subdirectories')
parser.add_argument('--key-sizes', dest='key_sizes', action='store_true',
help='Parse only the results regarding keys size measurements')
parser.add_argument('--key-gen', dest='key_gen', action='store_true',
help='Parse only the results regarding keys generation time measurements')
parser.add_argument('--throughput', dest='throughput', action='store_true',
help='Compute and append number of operations per second and'
'operations per dollar')
parser.add_argument('--backend', dest='backend', default='cpu',
help='Backend on which benchmarks have run')
parser.add_argument(
"results",
help="Location of criterion benchmark results directory."
"If the --key-size option is used, then the value would have to point to"
"a CSV file.",
)
parser.add_argument("output_file", help="File storing parsed results")
parser.add_argument(
"-d",
"--database",
dest="database",
help="Name of the database used to store results",
)
parser.add_argument(
"-w",
"--hardware",
dest="hardware",
help="Hardware reference used to perform benchmark",
)
parser.add_argument(
"-V", "--project-version", dest="project_version", help="Commit hash reference"
)
parser.add_argument(
"-b",
"--branch",
dest="branch",
help="Git branch name on which benchmark was performed",
)
parser.add_argument(
"--commit-date",
dest="commit_date",
help="Timestamp of commit hash used in project_version",
)
parser.add_argument(
"--bench-date", dest="bench_date", help="Timestamp when benchmark was run"
)
parser.add_argument(
"--name-suffix",
dest="name_suffix",
default="",
help="Suffix to append to each of the result test names",
)
parser.add_argument(
"--append-results",
dest="append_results",
action="store_true",
help="Append parsed results to an existing file",
)
parser.add_argument(
"--walk-subdirs",
dest="walk_subdirs",
action="store_true",
help="Check for results in subdirectories",
)
parser.add_argument(
"--key-sizes",
dest="key_sizes",
action="store_true",
help="Parse only the results regarding keys size measurements",
)
parser.add_argument(
"--key-gen",
dest="key_gen",
action="store_true",
help="Parse only the results regarding keys generation time measurements",
)
parser.add_argument(
"--throughput",
dest="throughput",
action="store_true",
help="Compute and append number of operations per second and"
"operations per dollar",
)
parser.add_argument(
"--backend",
dest="backend",
default="cpu",
help="Backend on which benchmarks have run",
)
def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throughput=False,
hardware_hourly_cost=None):
def recursive_parse(
directory,
walk_subdirs=False,
name_suffix="",
compute_throughput=False,
hardware_hourly_cost=None,
):
"""
Parse all the benchmark results in a directory. It will attempt to parse all the files having a
.json extension at the top-level of this directory.
@@ -84,7 +137,9 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
full_name, test_name = parse_benchmark_file(subdir)
if test_name is None:
parsing_failures.append((full_name, "'function_id' field is null in report"))
parsing_failures.append(
(full_name, "'function_id' field is null in report")
)
continue
try:
@@ -94,7 +149,9 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
continue
for stat_name, value in parse_estimate_file(subdir).items():
test_name_parts = list(filter(None, [test_name, stat_name, name_suffix]))
test_name_parts = list(
filter(None, [test_name, stat_name, name_suffix])
)
result_values.append(
_create_point(
@@ -104,19 +161,26 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
"latency",
operator,
params,
display_name=display_name
display_name=display_name,
)
)
lowercase_test_name = test_name.lower()
# This is a special case where PBSs are run over variable-length vectors of
# LWE ciphertexts to saturate the machine. To get the actual throughput we
# need to multiply by the length of the vector.
if "PBS_throughput" in test_name and "chunk" in test_name:
if (
"pbs_throughput" in lowercase_test_name
and lowercase_test_name.endswith("chunk")
):
try:
multiplier = int(test_name.split("chunk")[0].split("_")[-1])
multiplier = int(
lowercase_test_name.strip("chunk").split("::")[-1]
)
except ValueError:
parsing_failures.append((full_name,
"failed to extract throughput multiplier"))
parsing_failures.append(
(full_name, "failed to extract throughput multiplier")
)
continue
else:
multiplier = 1
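As a concrete example of the new extraction: for a benchmark named `core_crypto::pbs_throughput::PARAM_X::8192chunk`, the code above yields a multiplier of 8192. A Rust rendering of the same logic, purely for illustration (equivalent to the Python for well-formed names):

```rust
// Mirrors the parser's multiplier extraction for the new `::`-separated
// benchmark ids, e.g. "core_crypto::pbs_throughput::PARAM_X::8192chunk".
fn throughput_multiplier(test_name: &str) -> Option<u64> {
    let last_segment = test_name.to_lowercase().rsplit("::").next()?.to_string();
    last_segment.strip_suffix("chunk")?.parse().ok()
}

fn main() {
    assert_eq!(
        throughput_multiplier("core_crypto::pbs_throughput::PARAM_X::8192chunk"),
        Some(8192)
    );
}
```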
@@ -132,7 +196,7 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
"throughput",
operator,
params,
display_name="_".join([display_name, test_suffix])
display_name="_".join([display_name, test_suffix]),
)
)
test_name_parts.pop()
@@ -142,20 +206,23 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
test_name_parts.append(test_suffix)
result_values.append(
_create_point(
multiplier * compute_ops_per_dollar(value, hardware_hourly_cost),
multiplier
* compute_ops_per_dollar(value, hardware_hourly_cost),
"_".join(test_name_parts),
bench_class,
"throughput",
operator,
params,
display_name="_".join([display_name, test_suffix])
display_name="_".join([display_name, test_suffix]),
)
)
return result_values, parsing_failures
def _create_point(value, test_name, bench_class, bench_type, operator, params, display_name=None):
def _create_point(
value, test_name, bench_class, bench_type, operator, params, display_name=None
):
return {
"value": value,
"test": test_name,
@@ -163,7 +230,8 @@ def _create_point(value, test_name, bench_class, bench_type, operator, params, d
"class": bench_class,
"type": bench_type,
"operator": operator,
"params": params}
"params": params,
}
def parse_benchmark_file(directory):
@@ -206,21 +274,24 @@ def _parse_key_results(result_file, bench_type):
with result_file.open() as csv_file:
reader = csv.reader(csv_file)
for (test_name, value) in reader:
for test_name, value in reader:
try:
params, display_name, operator = get_parameters(test_name)
except Exception as err:
parsing_failures.append((test_name, f"failed to get parameters: {err}"))
continue
result_values.append({
"value": int(value),
"test": test_name,
"name": display_name,
"class": "keygen",
"type": bench_type,
"operator": operator,
"params": params})
result_values.append(
{
"value": int(value),
"test": test_name,
"name": display_name,
"class": "keygen",
"type": bench_type,
"operator": operator,
"params": params,
}
)
return result_values, parsing_failures
@@ -288,7 +359,7 @@ def compute_ops_per_second(data_point):
:return: number of operations per second
"""
return 1E9 / data_point
return 1e9 / data_point
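In other words, for a mean latency of $t$ nanoseconds per operation and an hourly machine cost $c$ (the role the `ONE_HOUR_IN_NANOSECONDS` constant plays above; the per-dollar form is inferred from that constant rather than quoted from the code):

$$\text{ops/s} = \frac{10^9}{t}, \qquad \text{ops/\$} = \frac{3600 \times 10^9}{t \cdot c}$$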
def _parse_file_to_json(directory, filename):
@@ -337,9 +408,16 @@ def check_mandatory_args(input_args):
missing_args = []
for arg_name in vars(input_args):
if arg_name in ["results_dir", "output_file", "name_suffix",
"append_results", "walk_subdirs", "key_sizes",
"key_gen", "throughput"]:
if arg_name in [
"results_dir",
"output_file",
"name_suffix",
"append_results",
"walk_subdirs",
"key_sizes",
"key_gen",
"throughput",
]:
continue
if not getattr(input_args, arg_name):
missing_args.append(arg_name)
@@ -354,7 +432,7 @@ if __name__ == "__main__":
args = parser.parse_args()
check_mandatory_args(args)
#failures = []
# failures = []
raw_results = pathlib.Path(args.results)
if args.key_sizes or args.key_gen:
if args.key_sizes:
@@ -370,7 +448,8 @@ if __name__ == "__main__":
if args.throughput:
print("Throughput computation enabled")
ec2_costs = json.loads(
pathlib.Path("ci/ec2_products_cost.json").read_text(encoding="utf-8"))
pathlib.Path("ci/ec2_products_cost.json").read_text(encoding="utf-8")
)
try:
hardware_cost = abs(ec2_costs[args.hardware])
print(f"Hardware hourly cost: {hardware_cost} $/h")
@@ -378,8 +457,13 @@ if __name__ == "__main__":
print(f"Cannot find hardware hourly cost for '{args.hardware}'")
sys.exit(1)
results, failures = recursive_parse(raw_results, args.walk_subdirs, args.name_suffix,
args.throughput, hardware_cost)
results, failures = recursive_parse(
raw_results,
args.walk_subdirs,
args.name_suffix,
args.throughput,
hardware_cost,
)
print("Parsing results done")

View File

@@ -3,5 +3,6 @@
"hpc7a.96xlarge": 7.7252,
"p3.2xlarge": 3.06,
"p4d.24xlarge": 32.7726,
"p5.48xlarge": 98.32
"p5.48xlarge": 98.32,
"rtx4090": 0.04
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe"
version = "0.5.0"
version = "0.6.0"
edition = "2021"
readme = "../README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
@@ -17,7 +17,7 @@ exclude = [
"/js_on_wasm_tests/",
"/web_wasm_parallel_tests/",
]
rust-version = "1.72"
rust-version = "1.73"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -62,7 +62,7 @@ rayon = { version = "1.5.0" }
bincode = "1.3.3"
concrete-fft = { version = "0.4.0", features = ["serde", "fft128"] }
pulp = "0.18.8"
tfhe-cuda-backend = { version = "0.1.2", path = "../backends/tfhe-cuda-backend", optional = true }
tfhe-cuda-backend = { version = "0.2.0", path = "../backends/tfhe-cuda-backend", optional = true }
aligned-vec = { version = "0.5", features = ["serde"] }
dyn-stack = { version = "0.9" }
paste = "1.0.7"
@@ -128,7 +128,6 @@ generator_aarch64_aes = ["concrete-csprng/generator_aarch64_aes"]
# Private features
__profiling = []
__coverage = []
seeder_unix = ["concrete-csprng/seeder_unix"]
seeder_x86_64_rdseed = ["concrete-csprng/seeder_x86_64_rdseed"]
@@ -170,7 +169,7 @@ required-features = ["boolean", "shortint", "internal-keycache"]
name = "dev-bench"
path = "benches/core_crypto/dev_bench.rs"
harness = false
required-features = ["experimental", "internal-keycache"]
required-features = ["internal-keycache"]
[[bench]]
name = "pbs128-bench"
@@ -295,5 +294,9 @@ required-features = ["integer"]
name = "pbs_count"
required-features = ["integer", "pbs-stats"]
[[example]]
name = "dist_tuniform"
required-features = ["integer", "internal-keycache"]
[lib]
crate-type = ["lib", "staticlib", "cdylib"]

View File

@@ -63,7 +63,7 @@ fn multi_bit_pbs<Scalar: UnsignedTorus + CastInto<usize> + CastFrom<usize> + Syn
let (
mut input_lwe_dimension,
lwe_modular_std_dev,
lwe_std_dev,
decomp_base_log,
decomp_level_count,
glwe_dimension,
@@ -72,6 +72,11 @@ fn multi_bit_pbs<Scalar: UnsignedTorus + CastInto<usize> + CastFrom<usize> + Syn
thread_count,
) = get_bench_params::<Scalar>();
let lwe_noise_distribution = Gaussian {
std: lwe_std_dev.0,
mean: 0.0,
};
let ciphertext_modulus = CiphertextModulus::new_native();
while input_lwe_dimension.0 % grouping_factor.0 != 0 {
@@ -110,7 +115,7 @@ fn multi_bit_pbs<Scalar: UnsignedTorus + CastInto<usize> + CastFrom<usize> + Syn
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
lwe_modular_std_dev,
lwe_noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);
@@ -154,7 +159,7 @@ fn pbs<Scalar: UnsignedTorus + CastInto<usize>>(c: &mut Criterion) {
let (
input_lwe_dimension,
lwe_modular_std_dev,
lwe_std_dev,
decomp_base_log,
decomp_level_count,
glwe_dimension,
@@ -163,6 +168,11 @@ fn pbs<Scalar: UnsignedTorus + CastInto<usize>>(c: &mut Criterion) {
_,
) = get_bench_params::<Scalar>();
let lwe_noise_distribution = Gaussian {
std: lwe_std_dev.0,
mean: 0.0,
};
let ciphertext_modulus = CiphertextModulus::new_native();
// Create the PRNG
@@ -197,7 +207,7 @@ fn pbs<Scalar: UnsignedTorus + CastInto<usize>>(c: &mut Criterion) {
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
lwe_modular_std_dev,
lwe_noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);
@@ -239,7 +249,7 @@ fn mem_optimized_pbs<Scalar: UnsignedTorus + CastInto<usize>>(c: &mut Criterion)
let (
input_lwe_dimension,
lwe_modular_std_dev,
lwe_std_dev,
decomp_base_log,
decomp_level_count,
glwe_dimension,
@@ -248,6 +258,13 @@ fn mem_optimized_pbs<Scalar: UnsignedTorus + CastInto<usize>>(c: &mut Criterion)
_,
) = get_bench_params::<Scalar>();
let lwe_noise_distribution = Gaussian {
std: lwe_std_dev.0,
mean: 0.0,
};
let ciphertext_modulus = CiphertextModulus::new_native();
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
@@ -280,7 +297,7 @@ fn mem_optimized_pbs<Scalar: UnsignedTorus + CastInto<usize>>(c: &mut Criterion)
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
lwe_modular_std_dev,
lwe_noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);

View File

@@ -104,7 +104,8 @@ fn keyswitch<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(criterion: &mu
for (name, params) in benchmark_parameters::<Scalar>().iter() {
let lwe_dimension = params.lwe_dimension.unwrap();
let lwe_modular_std_dev = params.lwe_modular_std_dev.unwrap();
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
let glwe_dimension = params.glwe_dimension.unwrap();
let polynomial_size = params.polynomial_size.unwrap();
let ks_decomp_base_log = params.ks_base_log.unwrap();
@@ -124,7 +125,7 @@ fn keyswitch<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(criterion: &mu
&lwe_sk,
ks_decomp_base_log,
ks_decomp_level_count,
lwe_modular_std_dev,
lwe_noise_distribution,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
);
@@ -132,7 +133,7 @@ fn keyswitch<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(criterion: &mu
let ct = allocate_and_encrypt_new_lwe_ciphertext(
&big_lwe_sk,
Plaintext(Scalar::ONE),
lwe_modular_std_dev,
lwe_noise_distribution,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
);
@@ -143,7 +144,7 @@ fn keyswitch<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(criterion: &mu
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let id = format!("{bench_name}_{name}");
let id = format!("{bench_name}::{name}");
{
bench_group.bench_function(&id, |b| {
b.iter(|| {
@@ -197,7 +198,8 @@ mod cuda {
for (name, params) in benchmark_parameters::<Scalar>().iter() {
let lwe_dimension = params.lwe_dimension.unwrap();
let lwe_modular_std_dev = params.lwe_modular_std_dev.unwrap();
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
let glwe_dimension = params.glwe_dimension.unwrap();
let polynomial_size = params.polynomial_size.unwrap();
let ks_decomp_base_log = params.ks_base_log.unwrap();
@@ -219,7 +221,7 @@ mod cuda {
&lwe_sk,
ks_decomp_base_log,
ks_decomp_level_count,
lwe_modular_std_dev,
lwe_noise_distribution,
CiphertextModulus::new_native(),
&mut encryption_generator,
);
@@ -229,7 +231,7 @@ mod cuda {
let ct = allocate_and_encrypt_new_lwe_ciphertext(
&big_lwe_sk,
Plaintext(Scalar::ONE),
lwe_modular_std_dev,
lwe_noise_distribution,
CiphertextModulus::new_native(),
&mut encryption_generator,
);
@@ -251,7 +253,7 @@ mod cuda {
}
stream.synchronize();
let id = format!("{bench_name}_{name}");
let id = format!("{bench_name}::{name}");
{
bench_group.bench_function(&id, |b| {
b.iter(|| {

View File

@@ -14,7 +14,8 @@ fn criterion_bench(c: &mut Criterion) {
let small_lwe_dimension = LweDimension(742);
let glwe_dimension = GlweDimension(1);
let polynomial_size = PolynomialSize(2048);
let lwe_modular_std_dev = StandardDev(sqr(0.000007069849454709433));
let lwe_noise_distribution =
Gaussian::from_standard_dev(StandardDev(sqr(0.000007069849454709433)), 0.0);
let pbs_base_log = DecompositionBaseLog(23);
let pbs_level = DecompositionLevelCount(1);
let ciphertext_modulus = CiphertextModulus::new_native();
@@ -61,7 +62,7 @@ fn criterion_bench(c: &mut Criterion) {
let lwe_ciphertext_in: LweCiphertextOwned<Scalar> = allocate_and_encrypt_new_lwe_ciphertext(
&small_lwe_sk,
plaintext,
lwe_modular_std_dev,
lwe_noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);
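For readers tracking the NoiseDistribution refactor across these hunks, here is a small sketch of the two constructors involved, based on the calls visible in this diff; the exact prelude paths and values are assumptions for illustration:

```rust
use tfhe::core_crypto::prelude::*;

fn example_noise_distributions() -> (DynamicDistribution<u64>, DynamicDistribution<u64>) {
    // Gaussian noise built from a standard deviation, replacing the former
    // raw `*_modular_std_dev` arguments (value is illustrative only):
    let gaussian =
        DynamicDistribution::new_gaussian_from_std_dev(StandardDev(0.000007069849454709433));
    // Bounded t-uniform noise, the other variant exercised by the C API
    // tests further down (`new_t_uniform`):
    let t_uniform = DynamicDistribution::new_t_uniform(12);
    // Either can be passed where encryption previously took a raw std dev.
    (gaussian, t_uniform)
}
```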

View File

@@ -70,23 +70,32 @@ fn benchmark_parameters<Scalar: UnsignedInteger>() -> Vec<(String, CryptoParamet
fn throughput_benchmark_parameters<Scalar: UnsignedInteger>(
) -> Vec<(String, CryptoParametersRecord<Scalar>)> {
if Scalar::BITS == 64 {
let parameters = if cfg!(feature = "gpu") {
vec![
PARAM_MESSAGE_1_CARRY_1_KS_PBS,
PARAM_MESSAGE_2_CARRY_2_KS_PBS,
PARAM_MESSAGE_3_CARRY_3_KS_PBS,
]
} else {
vec![
PARAM_MESSAGE_1_CARRY_1_KS_PBS,
PARAM_MESSAGE_2_CARRY_2_KS_PBS,
PARAM_MESSAGE_3_CARRY_3_KS_PBS,
PARAM_MESSAGE_4_CARRY_4_KS_PBS,
]
.iter()
.map(|params| {
(
params.name(),
<ClassicPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
)
})
.collect()
};
if Scalar::BITS == 64 {
parameters
.iter()
.map(|params| {
(
params.name(),
<ClassicPBSParameters as Into<PBSParameters>>::into(*params)
.to_owned()
.into(),
)
})
.collect()
} else if Scalar::BITS == 32 {
BOOLEAN_BENCH_PARAMS
.iter()
@@ -137,6 +146,9 @@ fn multi_bit_benchmark_parameters<Scalar: UnsignedInteger + Default>(
fn mem_optimized_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(c: &mut Criterion) {
let bench_name = "core_crypto::pbs_mem_optimized";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
// Create the PRNG
let mut seeder = new_seeder();
@@ -169,11 +181,14 @@ fn mem_optimized_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(c: &mu
params.pbs_level.unwrap(),
);
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in: LweCiphertextOwned<Scalar> = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
params.lwe_modular_std_dev.unwrap(),
lwe_noise_distribution,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
);
@@ -207,7 +222,7 @@ fn mem_optimized_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(c: &mu
.unaligned_bytes_required(),
);
let id = format!("{bench_name}_{name}");
let id = format!("{bench_name}::{name}");
{
bench_group.bench_function(&id, |b| {
b.iter(|| {
@@ -244,6 +259,9 @@ fn multi_bit_pbs<
) {
let bench_name = "core_crypto::multi_bit_pbs";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
// Create the PRNG
let mut seeder = new_seeder();
@@ -276,11 +294,14 @@ fn multi_bit_pbs<
*grouping_factor,
);
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
params.lwe_modular_std_dev.unwrap(),
lwe_noise_distribution,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
);
@@ -299,7 +320,7 @@ fn multi_bit_pbs<
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let id = format!("{bench_name}_{name}_parallelized");
let id = format!("{bench_name}::{name}::parallelized");
bench_group.bench_function(&id, |b| {
b.iter(|| {
multi_bit_programmable_bootstrap_lwe_ciphertext(
@@ -333,6 +354,9 @@ fn multi_bit_deterministic_pbs<
) {
let bench_name = "core_crypto::multi_bit_deterministic_pbs";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
// Create the PRNG
let mut seeder = new_seeder();
@@ -365,11 +389,14 @@ fn multi_bit_deterministic_pbs<
*grouping_factor,
);
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
params.lwe_modular_std_dev.unwrap(),
lwe_noise_distribution,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
);
@@ -388,7 +415,7 @@ fn multi_bit_deterministic_pbs<
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let id = format!("{bench_name}_{name}_parallelized");
let id = format!("{bench_name}::{name}::parallelized");
bench_group.bench_function(&id, |b| {
b.iter(|| {
multi_bit_deterministic_programmable_bootstrap_lwe_ciphertext(
@@ -420,6 +447,9 @@ fn pbs_throughput<Scalar: UnsignedTorus + CastInto<usize> + Sync + Send + Serial
) {
let bench_name = "core_crypto::pbs_throughput";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
// Create the PRNG
let mut seeder = new_seeder();
@@ -443,13 +473,16 @@ fn pbs_throughput<Scalar: UnsignedTorus + CastInto<usize> + Sync + Send + Serial
let big_lwe_sk = glwe_secret_key.into_lwe_secret_key();
let big_lwe_dimension = big_lwe_sk.lwe_dimension();
const NUM_CTS: usize = 512;
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
const NUM_CTS: usize = 8192;
let lwe_vec: Vec<_> = (0..NUM_CTS)
.map(|_| {
allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
params.lwe_modular_std_dev.unwrap(),
lwe_noise_distribution,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
)
@@ -497,8 +530,8 @@ fn pbs_throughput<Scalar: UnsignedTorus + CastInto<usize> + Sync + Send + Serial
params.pbs_level.unwrap(),
);
for chunk_size in [1, 16, 32, 64, 128, 256, 512] {
let id = format!("{bench_name}_{name}_{chunk_size}chunk");
for chunk_size in [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192] {
let id = format!("{bench_name}::{name}::{chunk_size}chunk");
{
bench_group.bench_function(&id, |b| {
b.iter(|| {
@@ -538,7 +571,7 @@ fn pbs_throughput<Scalar: UnsignedTorus + CastInto<usize> + Sync + Send + Serial
#[cfg(feature = "gpu")]
mod cuda {
use super::multi_bit_benchmark_parameters;
use super::{multi_bit_benchmark_parameters, throughput_benchmark_parameters};
use crate::utilities::{write_to_json, CryptoParametersRecord, OperatorType};
use criterion::{black_box, criterion_group, Criterion};
use serde::Serialize;
@@ -602,6 +635,9 @@ mod cuda {
fn cuda_pbs<Scalar: UnsignedTorus + CastInto<usize> + Serialize>(c: &mut Criterion) {
let bench_name = "core_crypto::cuda::pbs";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
// Create the PRNG
let mut seeder = new_seeder();
@@ -640,11 +676,14 @@ mod cuda {
);
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, &stream);
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
params.lwe_modular_std_dev.unwrap(),
lwe_noise_distribution,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
);
@@ -680,7 +719,7 @@ mod cuda {
}
stream.synchronize();
let id = format!("{bench_name}_{name}");
let id = format!("{bench_name}::{name}");
{
bench_group.bench_function(&id, |b| {
b.iter(|| {
@@ -720,6 +759,9 @@ mod cuda {
) {
let bench_name = "core_crypto::cuda::multi_bit_pbs";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
// Create the PRNG
let mut seeder = new_seeder();
@@ -762,11 +804,14 @@ mod cuda {
&stream,
);
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
Plaintext(Scalar::ZERO),
params.lwe_modular_std_dev.unwrap(),
lwe_noise_distribution,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
&mut encryption_generator,
);
@@ -802,7 +847,7 @@ mod cuda {
}
stream.synchronize();
let id = format!("{bench_name}_{name}");
let id = format!("{bench_name}::{name}");
bench_group.bench_function(&id, |b| {
b.iter(|| {
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext(
@@ -832,44 +877,337 @@ mod cuda {
}
}
criterion_group!(
name = cuda_pbs_group;
config = Criterion::default().sample_size(2000);
targets = cuda_pbs::<u64>
);
fn cuda_pbs_throughput<
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<usize> + Default + Serialize + Sync,
>(
c: &mut Criterion,
) {
let bench_name = "core_crypto::cuda::pbs_throughput";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
let mut encryption_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed(), seeder);
let mut secret_generator =
SecretRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed());
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (name, params) in throughput_benchmark_parameters::<Scalar>().iter() {
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let glwe_secret_key = GlweSecretKey::new_empty_key(
Scalar::ZERO,
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
);
let big_lwe_sk = glwe_secret_key.into_lwe_secret_key();
let big_lwe_dimension = big_lwe_sk.lwe_dimension();
let bsk = LweBootstrapKey::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
params.polynomial_size.unwrap(),
params.pbs_base_log.unwrap(),
params.pbs_level.unwrap(),
params.lwe_dimension.unwrap(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let bsk_gpu = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, &stream);
const NUM_CTS: usize = 8192;
let plaintext_list = PlaintextList::new(Scalar::ZERO, PlaintextCount(NUM_CTS));
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
let mut lwe_list = LweCiphertextList::new(
Scalar::ZERO,
params.lwe_dimension.unwrap().to_lwe_size(),
LweCiphertextCount(NUM_CTS),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
encrypt_lwe_ciphertext_list(
&input_lwe_secret_key,
&mut lwe_list,
&plaintext_list,
lwe_noise_distribution,
&mut encryption_generator,
);
let underlying_container: Vec<Scalar> = lwe_list.into_container();
let input_lwe_list = LweCiphertextList::from_container(
underlying_container,
params.lwe_dimension.unwrap().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
);
let output_lwe_list = LweCiphertextList::new(
Scalar::ZERO,
big_lwe_dimension.to_lwe_size(),
LweCiphertextCount(NUM_CTS),
params.ciphertext_modulus.unwrap(),
);
let lwe_ciphertext_in_gpu =
CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream);
let accumulator = GlweCiphertext::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
params.polynomial_size.unwrap(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext_list(&output_lwe_list, &stream);
let mut h_indexes: [Scalar; NUM_CTS] = [Scalar::ZERO; NUM_CTS];
let mut d_lut_indexes = unsafe { CudaVec::<Scalar>::new_async(NUM_CTS, &stream) };
unsafe {
d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream);
}
stream.synchronize();
for (i, index) in h_indexes.iter_mut().enumerate() {
*index = Scalar::cast_from(i);
}
stream.synchronize();
let mut d_input_indexes = unsafe { CudaVec::<Scalar>::new_async(NUM_CTS, &stream) };
let mut d_output_indexes = unsafe { CudaVec::<Scalar>::new_async(NUM_CTS, &stream) };
unsafe {
d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream);
d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream);
}
stream.synchronize();
let id = format!("{bench_name}::{name}::{NUM_CTS}chunk");
bench_group.bench_function(&id, |b| {
b.iter(|| {
cuda_programmable_bootstrap_lwe_ciphertext(
&lwe_ciphertext_in_gpu,
&mut out_pbs_ct_gpu,
&accumulator_gpu,
&d_lut_indexes,
&d_output_indexes,
&d_input_indexes,
LweCiphertextCount(NUM_CTS),
&bsk_gpu,
&stream,
);
black_box(&mut out_pbs_ct_gpu);
})
});
let bit_size = params.message_modulus.unwrap().ilog2();
write_to_json(
&id,
*params,
name,
"pbs",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
}
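One reading of the index setup in this new benchmark (an interpretation, not stated in the diff): `d_lut_indexes` is uploaded while `h_indexes` still holds zeros, so all 8192 ciphertexts select LUT 0, whereas the identity pattern written afterwards only reaches `d_input_indexes` and `d_output_indexes`, mapping ciphertext i to slot i on both the input and output sides.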
fn cuda_multi_bit_pbs_throughput<
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<usize> + Default + Serialize + Sync,
>(
c: &mut Criterion,
) {
let bench_name = "core_crypto::cuda::multi_bit_pbs_throughput";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
// Create the PRNG
let mut seeder = new_seeder();
let seeder = seeder.as_mut();
let mut encryption_generator =
EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed(), seeder);
let mut secret_generator =
SecretRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed());
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (name, params, grouping_factor) in multi_bit_benchmark_parameters::<Scalar>().iter() {
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let glwe_secret_key = GlweSecretKey::new_empty_key(
Scalar::ZERO,
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
);
let big_lwe_sk = glwe_secret_key.into_lwe_secret_key();
let big_lwe_dimension = big_lwe_sk.lwe_dimension();
let multi_bit_bsk = LweMultiBitBootstrapKey::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
params.polynomial_size.unwrap(),
params.pbs_base_log.unwrap(),
params.pbs_level.unwrap(),
params.lwe_dimension.unwrap(),
*grouping_factor,
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let multi_bit_bsk_gpu = CudaLweMultiBitBootstrapKey::from_lwe_multi_bit_bootstrap_key(
&multi_bit_bsk,
&stream,
);
const NUM_CTS: usize = 8192;
let lwe_noise_distribution =
DynamicDistribution::new_gaussian_from_std_dev(params.lwe_std_dev.unwrap());
let plaintext_list = PlaintextList::new(Scalar::ZERO, PlaintextCount(NUM_CTS));
let mut lwe_list = LweCiphertextList::new(
Scalar::ZERO,
params.lwe_dimension.unwrap().to_lwe_size(),
LweCiphertextCount(NUM_CTS),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
encrypt_lwe_ciphertext_list(
&input_lwe_secret_key,
&mut lwe_list,
&plaintext_list,
lwe_noise_distribution,
&mut encryption_generator,
);
let underlying_container: Vec<Scalar> = lwe_list.into_container();
let input_lwe_list = LweCiphertextList::from_container(
underlying_container,
params.lwe_dimension.unwrap().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
);
let output_lwe_list = LweCiphertextList::new(
Scalar::ZERO,
big_lwe_dimension.to_lwe_size(),
LweCiphertextCount(NUM_CTS),
params.ciphertext_modulus.unwrap(),
);
let lwe_ciphertext_in_gpu =
CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream);
let accumulator = GlweCiphertext::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
params.polynomial_size.unwrap(),
tfhe::core_crypto::prelude::CiphertextModulus::new_native(),
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext_list(&output_lwe_list, &stream);
let mut h_indexes: [Scalar; NUM_CTS] = [Scalar::ZERO; NUM_CTS];
let mut d_lut_indexes = unsafe { CudaVec::<Scalar>::new_async(NUM_CTS, &stream) };
unsafe {
d_lut_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream);
}
stream.synchronize();
for (i, index) in h_indexes.iter_mut().enumerate() {
*index = Scalar::cast_from(i);
}
stream.synchronize();
let mut d_input_indexes = unsafe { CudaVec::<Scalar>::new_async(NUM_CTS, &stream) };
let mut d_output_indexes = unsafe { CudaVec::<Scalar>::new_async(NUM_CTS, &stream) };
unsafe {
d_input_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream);
d_output_indexes.copy_from_cpu_async(h_indexes.as_ref(), &stream);
}
stream.synchronize();
let id = format!("{bench_name}::{name}::{NUM_CTS}chunk");
bench_group.bench_function(&id, |b| {
b.iter(|| {
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext(
&lwe_ciphertext_in_gpu,
&mut out_pbs_ct_gpu,
&accumulator_gpu,
&d_lut_indexes,
&d_output_indexes,
&d_input_indexes,
&multi_bit_bsk_gpu,
&stream,
);
black_box(&mut out_pbs_ct_gpu);
})
});
let bit_size = params.message_modulus.unwrap().ilog2();
write_to_json(
&id,
*params,
name,
"pbs",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
}
criterion_group!(cuda_pbs_group, cuda_pbs::<u64>);
criterion_group!(cuda_multi_bit_pbs_group, cuda_multi_bit_pbs::<u64>);
criterion_group!(cuda_pbs_throughput_group, cuda_pbs_throughput::<u64>);
criterion_group!(
name = cuda_multi_bit_pbs_group;
config = Criterion::default().sample_size(2000);
targets = cuda_multi_bit_pbs::<u64>
cuda_multi_bit_pbs_throughput_group,
cuda_multi_bit_pbs_throughput::<u64>
);
}
#[cfg(feature = "gpu")]
use cuda::{cuda_multi_bit_pbs_group, cuda_pbs_group};
use cuda::{
cuda_multi_bit_pbs_group, cuda_multi_bit_pbs_throughput_group, cuda_pbs_group,
cuda_pbs_throughput_group,
};
criterion_group!(
name = pbs_group;
config = Criterion::default().sample_size(2000);
targets = mem_optimized_pbs::<u64>, mem_optimized_pbs::<u32>
pbs_group,
mem_optimized_pbs::<u64>,
mem_optimized_pbs::<u32>
);
criterion_group!(
name = multi_bit_pbs_group;
config = Criterion::default().sample_size(2000);
targets = multi_bit_pbs::<u64>,
multi_bit_pbs::<u32>,
multi_bit_deterministic_pbs::<u64>,
multi_bit_deterministic_pbs::<u32>,
multi_bit_pbs_group,
multi_bit_pbs::<u64>,
multi_bit_pbs::<u32>,
multi_bit_deterministic_pbs::<u64>,
multi_bit_deterministic_pbs::<u32>,
);
criterion_group!(
name = pbs_throughput_group;
config = Criterion::default().sample_size(100);
targets = pbs_throughput::<u64>, pbs_throughput::<u32>
pbs_throughput_group,
pbs_throughput::<u64>,
pbs_throughput::<u32>
);
#[cfg(not(feature = "gpu"))]
criterion_main!(pbs_group, multi_bit_pbs_group, pbs_throughput_group);
#[cfg(feature = "gpu")]
criterion_main!(cuda_pbs_group, cuda_multi_bit_pbs_group);
criterion_main!(
cuda_pbs_group,
cuda_multi_bit_pbs_group,
cuda_pbs_throughput_group,
cuda_multi_bit_pbs_throughput_group
);
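The group declarations above also illustrate the configuration move in this diff: sampling is no longer set through `criterion_group!`'s `config = ...` form but per benchmark group. A condensed sketch of the new shape, with values mirroring the ones used here:

```rust
use criterion::{criterion_group, criterion_main, Criterion};

fn example_bench(c: &mut Criterion) {
    let mut group = c.benchmark_group("core_crypto::example");
    // Previously: criterion_group!(name = ...; config = Criterion::default()
    // .sample_size(2000); targets = ...). Now the group configures itself:
    group
        .sample_size(15)
        .measurement_time(std::time::Duration::from_secs(60));
    group.bench_function("noop", |b| b.iter(|| ()));
    group.finish();
}

criterion_group!(example_group, example_bench);
criterion_main!(example_group);
```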

View File

@@ -11,7 +11,6 @@ use itertools::iproduct;
use rand::prelude::*;
use rand::Rng;
use std::vec::IntoIter;
use tfhe::core_crypto::algorithms::misc::divide_ceil;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::{IntegerKeyKind, RadixCiphertext, RadixClientKey, ServerKey};
use tfhe::keycache::NamedParam;
@@ -561,7 +560,7 @@ fn ciphertexts_sum_parallelized(c: &mut Criterion) {
bench_group.bench_function(&bench_id, |b| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let nb_ctxt = divide_ceil(bit_size, param.message_modulus().0.ilog2() as usize);
let nb_ctxt = bit_size.div_ceil(param.message_modulus().0.ilog2() as usize);
let cks = RadixClientKey::from((cks, nb_ctxt));
let encrypt_values = || {
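The `div_ceil` swap in the hunk above works because `usize::div_ceil` was stabilized in Rust 1.73, matching the `rust-version` bump in the Cargo.toml hunk earlier. A worked example of the block-count formula:

```rust
fn main() {
    // With PARAM_MESSAGE_2_CARRY_2 (message_modulus = 4, i.e. 2 message bits
    // per block), a 64-bit integer needs ceil(64 / 2) = 32 radix blocks.
    let message_modulus: usize = 4;
    let bit_size: usize = 64;
    let nb_ctxt = bit_size.div_ceil(message_modulus.ilog2() as usize);
    assert_eq!(nb_ctxt, 32);
}
```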
@@ -1068,6 +1067,12 @@ define_server_key_bench_unary_fn!(method_name: smart_abs_parallelized, display_n
define_server_key_bench_unary_default_fn!(method_name: neg_parallelized, display_name: negation);
define_server_key_bench_unary_default_fn!(method_name: abs_parallelized, display_name: abs);
define_server_key_bench_unary_default_fn!(method_name: leading_zeros, display_name: leading_zeros);
define_server_key_bench_unary_default_fn!(method_name: leading_ones, display_name: leading_ones);
define_server_key_bench_unary_default_fn!(method_name: trailing_zeros, display_name: trailing_zeros);
define_server_key_bench_unary_default_fn!(method_name: trailing_ones, display_name: trailing_ones);
define_server_key_bench_unary_default_fn!(method_name: ilog2, display_name: ilog2);
define_server_key_bench_unary_default_fn!(method_name: checked_ilog2, display_name: checked_ilog2);
define_server_key_bench_unary_default_fn!(method_name: unchecked_abs_parallelized, display_name: abs);
@@ -1184,9 +1189,8 @@ mod cuda {
use super::*;
use crate::utilities::{write_to_json, EnvConfig, OperatorType};
use criterion::{criterion_group, Criterion};
use tfhe::core_crypto::algorithms::misc::divide_ceil;
use tfhe::core_crypto::gpu::{CudaDevice, CudaStream};
use tfhe::integer::gpu::ciphertext::CudaRadixCiphertext;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use tfhe::integer::gpu::server_key::CudaServerKey;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::IntegerKeyKind;
@@ -1198,7 +1202,7 @@ mod cuda {
display_name: &str,
unary_op: F,
) where
F: Fn(&CudaServerKey, &mut CudaRadixCiphertext, &CudaStream),
F: Fn(&CudaServerKey, &mut CudaUnsignedRadixCiphertext, &CudaStream),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
@@ -1225,7 +1229,7 @@ mod cuda {
let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_0 = cks.encrypt_radix(clear_0, num_block);
CudaRadixCiphertext::from_radix_ciphertext(&ct_0, &stream)
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_0, &stream)
};
b.iter_batched(
@@ -1259,7 +1263,12 @@ mod cuda {
display_name: &str,
binary_op: F,
) where
F: Fn(&CudaServerKey, &mut CudaRadixCiphertext, &mut CudaRadixCiphertext, &CudaStream),
F: Fn(
&CudaServerKey,
&mut CudaUnsignedRadixCiphertext,
&mut CudaUnsignedRadixCiphertext,
&CudaStream,
),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
@@ -1291,8 +1300,10 @@ mod cuda {
let clear_1 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_1 = cks.encrypt_radix(clear_1, num_block);
let d_ctxt_1 = CudaRadixCiphertext::from_radix_ciphertext(&ct_0, &stream);
let d_ctxt_2 = CudaRadixCiphertext::from_radix_ciphertext(&ct_1, &stream);
let d_ctxt_1 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_0, &stream);
let d_ctxt_2 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_1, &stream);
(d_ctxt_1, d_ctxt_2)
};
@@ -1327,7 +1338,7 @@ mod cuda {
binary_op: F,
rng_func: G,
) where
F: Fn(&CudaServerKey, &mut CudaRadixCiphertext, ScalarType, &CudaStream),
F: Fn(&CudaServerKey, &mut CudaUnsignedRadixCiphertext, ScalarType, &CudaStream),
G: Fn(&mut ThreadRng, usize) -> ScalarType,
{
let mut bench_group = c.benchmark_group(bench_name);
@@ -1360,7 +1371,8 @@ mod cuda {
let clear_0 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_0 = cks.encrypt_radix(clear_0, num_block);
let d_ctxt_1 = CudaRadixCiphertext::from_radix_ciphertext(&ct_0, &stream);
let d_ctxt_1 =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_0, &stream);
let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size;
@@ -1428,9 +1440,12 @@ mod cuda {
let clear_1 = tfhe::integer::U256::from((clearlow, clearhigh));
let ct_else = cks.encrypt_radix(clear_1, num_block);
let d_ct_cond = CudaRadixCiphertext::from_radix_ciphertext(&ct_cond, &stream);
let d_ct_then = CudaRadixCiphertext::from_radix_ciphertext(&ct_then, &stream);
let d_ct_else = CudaRadixCiphertext::from_radix_ciphertext(&ct_else, &stream);
let d_ct_cond =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_cond, &stream);
let d_ct_then =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_then, &stream);
let d_ct_else =
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_else, &stream);
(d_ct_cond, d_ct_then, d_ct_else)
};
@@ -1904,7 +1919,7 @@ mod cuda {
display_name: &str,
cast_op: F,
) where
F: Fn(&CudaServerKey, CudaRadixCiphertext, usize),
F: Fn(&CudaServerKey, CudaUnsignedRadixCiphertext, usize),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
@@ -1922,7 +1937,7 @@ mod cuda {
.bit_sizes()
.iter()
.copied()
.map(|bit| divide_ceil(bit, param.message_modulus().0.ilog2() as usize))
.map(|bit| bit.div_ceil(param.message_modulus().0.ilog2() as usize))
.collect::<Vec<_>>();
let param_name = param.name();
@@ -1935,9 +1950,9 @@ mod cuda {
let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_one_value = || -> CudaRadixCiphertext {
let encrypt_one_value = || -> CudaUnsignedRadixCiphertext {
let ct = cks.encrypt_radix(gen_random_u256(&mut rng), num_blocks);
CudaRadixCiphertext::from_radix_ciphertext(&ct, &stream)
CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream)
};
b.iter_batched(
@@ -2069,6 +2084,12 @@ criterion_group!(
rotate_left_parallelized,
rotate_right_parallelized,
ciphertexts_sum_parallelized,
leading_zeros,
leading_ones,
trailing_zeros,
trailing_ones,
ilog2,
checked_ilog2,
);
criterion_group!(
@@ -2253,7 +2274,7 @@ fn bench_server_key_cast_function<F>(
.bit_sizes()
.iter()
.copied()
.map(|bit| divide_ceil(bit, param.message_modulus().0.ilog2() as usize))
.map(|bit| bit.div_ceil(param.message_modulus().0.ilog2() as usize))
.collect::<Vec<_>>();
let param_name = param.name();

View File

@@ -9,14 +9,15 @@ use itertools::iproduct;
use rand::prelude::*;
use rand::Rng;
use std::vec::IntoIter;
use tfhe::core_crypto::algorithms::misc::divide_ceil;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::{IntegerKeyKind, RadixCiphertext, ServerKey, SignedRadixCiphertext, I256};
use tfhe::keycache::NamedParam;
use tfhe::shortint::parameters::{
PARAM_MESSAGE_2_CARRY_2_KS_PBS, PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS,
};
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
#[cfg(not(feature = "gpu"))]
use tfhe::shortint::parameters::PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS;
#[cfg(feature = "gpu")]
use tfhe::shortint::parameters::PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS;
fn gen_random_i256(rng: &mut ThreadRng) -> I256 {
let clearlow = rng.gen::<u128>();
@@ -38,6 +39,9 @@ impl Default for ParamsAndNumBlocksIter {
let env_config = EnvConfig::new();
if env_config.is_multi_bit {
#[cfg(feature = "gpu")]
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS.into()];
#[cfg(not(feature = "gpu"))]
let params = vec![PARAM_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_2_KS_PBS.into()];
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());
@@ -337,6 +341,12 @@ define_server_key_bench_unary_signed_clean_input_fn!(
method_name: abs_parallelized,
display_name: abs
);
define_server_key_bench_unary_signed_clean_input_fn!(method_name: leading_zeros, display_name: leading_zeros);
define_server_key_bench_unary_signed_clean_input_fn!(method_name: leading_ones, display_name: leading_ones);
define_server_key_bench_unary_signed_clean_input_fn!(method_name: trailing_zeros, display_name: trailing_zeros);
define_server_key_bench_unary_signed_clean_input_fn!(method_name: trailing_ones, display_name: trailing_ones);
define_server_key_bench_unary_signed_clean_input_fn!(method_name: ilog2, display_name: ilog2);
define_server_key_bench_unary_signed_clean_input_fn!(method_name: checked_ilog2, display_name: checked_ilog2);
define_server_key_bench_binary_signed_clean_inputs_fn!(
method_name: add_parallelized,
@@ -492,6 +502,12 @@ criterion_group!(
right_shift_parallelized,
rotate_left_parallelized,
rotate_right_parallelized,
leading_zeros,
leading_ones,
trailing_zeros,
trailing_ones,
ilog2,
checked_ilog2,
);
criterion_group!(
@@ -1127,7 +1143,7 @@ fn bench_server_key_signed_cast_function<F>(
.bit_sizes()
.iter()
.copied()
.map(|bit| divide_ceil(bit, param.message_modulus().0.ilog2() as usize))
.map(|bit| bit.div_ceil(param.message_modulus().0.ilog2() as usize))
.collect::<Vec<_>>();
let param_name = param.name();
@@ -1183,29 +1199,407 @@ define_server_key_bench_cast_fn!(method_name: cast_to_signed, display_name: cast
criterion_group!(cast_ops, cast_to_unsigned, cast_to_signed);
#[cfg(feature = "gpu")]
mod cuda {
use super::*;
use crate::utilities::{write_to_json, OperatorType};
use criterion::{criterion_group, Criterion};
use tfhe::core_crypto::gpu::{CudaDevice, CudaStream};
use tfhe::integer::gpu::ciphertext::CudaSignedRadixCiphertext;
use tfhe::integer::gpu::server_key::CudaServerKey;
use tfhe::integer::keycache::KEY_CACHE;
use tfhe::integer::IntegerKeyKind;
use tfhe::keycache::NamedParam;
/// Base function to bench a server key function that is a binary operation; input ciphertexts
/// will contain only zero carries
fn bench_cuda_server_key_binary_signed_function_clean_inputs<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
) where
F: Fn(
&CudaServerKey,
&mut CudaSignedRadixCiphertext,
&mut CudaSignedRadixCiphertext,
&CudaStream,
),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();
let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_two_values = || {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear_0 = tfhe::integer::I256::from((clearlow, clearhigh));
let ct_0 = cks.encrypt_signed_radix(clear_0, num_block);
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear_1 = tfhe::integer::I256::from((clearlow, clearhigh));
let ct_1 = cks.encrypt_signed_radix(clear_1, num_block);
let d_ctxt_1 =
CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_0, &stream);
let d_ctxt_2 =
CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_1, &stream);
(d_ctxt_1, d_ctxt_2)
};
b.iter_batched(
encrypt_two_values,
|(mut ct_0, mut ct_1)| {
binary_op(&gpu_sks, &mut ct_0, &mut ct_1, &stream);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
macro_rules! define_cuda_server_key_bench_clean_input_signed_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_binary_signed_function_clean_inputs(
c,
concat!("integer::cuda::signed::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
}
)
}
}
}
);
/// Base function to bench a server key function that is a unary operation; input ciphertexts
/// will contain only zero carries
fn bench_cuda_server_key_unary_signed_function_clean_inputs<F>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
unary_op: F,
) where
F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, &CudaStream),
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();
let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_one_value = || {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear = tfhe::integer::I256::from((clearlow, clearhigh));
let ct = cks.encrypt_signed_radix(clear, num_block);
CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct, &stream)
};
b.iter_batched(
encrypt_one_value,
|mut ct| {
unary_op(&gpu_sks, &mut ct, &stream);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
macro_rules! define_cuda_server_key_bench_clean_input_signed_unary_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_unary_signed_function_clean_inputs(
c,
concat!("integer::cuda::signed::", stringify!($server_key_method)),
stringify!($name),
|server_key, input, stream| {
server_key.$server_key_method(input, stream);
}
)
}
}
}
);
fn bench_cuda_server_key_binary_scalar_signed_function_clean_inputs<F, G>(
c: &mut Criterion,
bench_name: &str,
display_name: &str,
binary_op: F,
rng_func: G,
) where
F: Fn(&CudaServerKey, &mut CudaSignedRadixCiphertext, ScalarType, &CudaStream),
G: Fn(&mut ThreadRng, usize) -> ScalarType,
{
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
let gpu_index = 0;
let device = CudaDevice::new(gpu_index);
let stream = CudaStream::new_unchecked(device);
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
if bit_size > ScalarType::BITS as usize {
break;
}
let param_name = param.name();
let max_value_for_bit_size = ScalarType::MAX >> (ScalarType::BITS as usize - bit_size);
let bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits_scalar_{bit_size}");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);
let encrypt_one_value = || {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
let clear_0 = tfhe::integer::I256::from((clearlow, clearhigh));
let ct_0 = cks.encrypt_signed_radix(clear_0, num_block);
let d_ct_0 =
CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct_0, &stream);
let clear_1 = rng_func(&mut rng, bit_size) & max_value_for_bit_size;
(d_ct_0, clear_1)
};
b.iter_batched(
encrypt_one_value,
|(mut ct_0, clear_1)| {
binary_op(&gpu_sks, &mut ct_0, clear_1, &stream);
},
criterion::BatchSize::SmallInput,
)
});
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
macro_rules! define_cuda_server_key_bench_clean_input_scalar_signed_fn (
(method_name: $server_key_method:ident, display_name:$name:ident, rng_func:$($rng_fn:tt)*) => {
::paste::paste!{
fn [<cuda_ $server_key_method>](c: &mut Criterion) {
bench_cuda_server_key_binary_scalar_signed_function_clean_inputs(
c,
concat!("integer::cuda::signed::", stringify!($server_key_method)),
stringify!($name),
|server_key, lhs, rhs, stream| {
server_key.$server_key_method(lhs, rhs, stream);
},
$($rng_fn)*
)
}
}
}
);
// Functions used to apply different ways of selecting a scalar based on the context.
fn default_signed_scalar(rng: &mut ThreadRng, _clear_bit_size: usize) -> ScalarType {
let clearlow = rng.gen::<u128>();
let clearhigh = rng.gen::<u128>();
tfhe::integer::I256::from((clearlow, clearhigh))
}
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_add,
display_name: add
);
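For readers unfamiliar with the `paste` crate, the invocation above expands to roughly the following (hand-reconstructed from the macro body, so treat it as a sketch):

```rust
fn cuda_unchecked_add(c: &mut Criterion) {
    bench_cuda_server_key_binary_signed_function_clean_inputs(
        c,
        "integer::cuda::signed::unchecked_add",
        "add",
        |server_key, lhs, rhs, stream| {
            server_key.unchecked_add(lhs, rhs, stream);
        },
    )
}
```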
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_sub,
display_name: sub
);
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: unchecked_neg,
display_name: neg
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: unchecked_mul,
display_name: mul
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: unchecked_scalar_add,
display_name: add,
rng_func: default_signed_scalar
);
//===========================================
// Default
//===========================================
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: add,
display_name: add
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: sub,
display_name: sub
);
define_cuda_server_key_bench_clean_input_signed_unary_fn!(
method_name: neg,
display_name: neg
);
define_cuda_server_key_bench_clean_input_signed_fn!(
method_name: mul,
display_name: mul
);
define_cuda_server_key_bench_clean_input_scalar_signed_fn!(
method_name: scalar_add,
display_name: add,
rng_func: default_signed_scalar
);
criterion_group!(
unchecked_cuda_ops,
cuda_unchecked_add,
cuda_unchecked_sub,
cuda_unchecked_neg,
cuda_unchecked_mul,
);
criterion_group!(unchecked_scalar_cuda_ops, cuda_unchecked_scalar_add,);
criterion_group!(default_cuda_ops, cuda_add, cuda_sub, cuda_neg, cuda_mul);
criterion_group!(default_scalar_cuda_ops, cuda_scalar_add);
}
#[cfg(feature = "gpu")]
use cuda::{
default_cuda_ops, default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops,
};
#[cfg(feature = "gpu")]
fn go_through_gpu_bench_groups(val: &str) {
match val.to_lowercase().as_str() {
"default" => {
default_cuda_ops();
default_scalar_cuda_ops();
}
"unchecked" => {
unchecked_cuda_ops();
unchecked_scalar_cuda_ops();
}
_ => panic!("unknown benchmark operations flavor"),
};
}
#[allow(dead_code)]
fn go_through_cpu_bench_groups(val: &str) {
match val.to_lowercase().as_str() {
"default" => {
default_parallelized_ops();
default_parallelized_ops_comp();
default_scalar_parallelized_ops();
default_scalar_parallelized_ops_comp();
cast_ops()
}
"unchecked" => {
unchecked_ops();
unchecked_ops_comp();
unchecked_scalar_ops();
unchecked_scalar_ops_comp()
}
_ => panic!("unknown benchmark operations flavor"),
};
}
fn main() {
match env::var("__TFHE_RS_BENCH_OP_FLAVOR") {
Ok(val) => {
match val.to_lowercase().as_str() {
"default" => {
default_parallelized_ops();
default_parallelized_ops_comp();
default_scalar_parallelized_ops();
default_scalar_parallelized_ops_comp();
cast_ops()
}
"unchecked" => {
unchecked_ops();
unchecked_ops_comp();
unchecked_scalar_ops();
unchecked_scalar_ops_comp()
}
_ => panic!("unknown benchmark operations flavor"),
};
#[cfg(feature = "gpu")]
go_through_gpu_bench_groups(&val);
#[cfg(not(feature = "gpu"))]
go_through_cpu_bench_groups(&val);
}
Err(_) => {
default_parallelized_ops();
default_parallelized_ops_comp();
default_scalar_parallelized_ops();
default_scalar_parallelized_ops_comp();
cast_ops()
}
};

View File

@@ -33,7 +33,7 @@ fn criterion_bench(c: &mut Criterion) {
&glwe_secret_key,
parameters.pbs_base_log,
parameters.pbs_level,
parameters.glwe_modular_std_dev,
parameters.glwe_noise_distribution,
CiphertextModulus::new_native(),
&mut encryption_generator,
);

View File

@@ -14,15 +14,15 @@ pub struct CryptoParametersRecord<Scalar: UnsignedInteger> {
pub lwe_dimension: Option<LweDimension>,
pub glwe_dimension: Option<GlweDimension>,
pub polynomial_size: Option<PolynomialSize>,
pub lwe_modular_std_dev: Option<StandardDev>,
pub glwe_modular_std_dev: Option<StandardDev>,
pub lwe_std_dev: Option<StandardDev>,
pub glwe_std_dev: Option<StandardDev>,
pub pbs_base_log: Option<DecompositionBaseLog>,
pub pbs_level: Option<DecompositionLevelCount>,
pub ks_base_log: Option<DecompositionBaseLog>,
pub ks_level: Option<DecompositionLevelCount>,
pub pfks_level: Option<DecompositionLevelCount>,
pub pfks_base_log: Option<DecompositionBaseLog>,
pub pfks_modular_std_dev: Option<StandardDev>,
pub pfks_std_dev: Option<StandardDev>,
pub cbs_level: Option<DecompositionLevelCount>,
pub cbs_base_log: Option<DecompositionBaseLog>,
pub message_modulus: Option<usize>,
@@ -37,15 +37,15 @@ impl<Scalar: UnsignedInteger> From<BooleanParameters> for CryptoParametersRecord
lwe_dimension: Some(params.lwe_dimension),
glwe_dimension: Some(params.glwe_dimension),
polynomial_size: Some(params.polynomial_size),
lwe_modular_std_dev: Some(params.lwe_modular_std_dev),
glwe_modular_std_dev: Some(params.glwe_modular_std_dev),
lwe_std_dev: Some(params.lwe_noise_distribution.gaussian_std_dev()),
glwe_std_dev: Some(params.glwe_noise_distribution.gaussian_std_dev()),
pbs_base_log: Some(params.pbs_base_log),
pbs_level: Some(params.pbs_level),
ks_base_log: Some(params.ks_base_log),
ks_level: Some(params.ks_level),
pfks_level: None,
pfks_base_log: None,
pfks_modular_std_dev: None,
pfks_std_dev: None,
cbs_level: None,
cbs_base_log: None,
message_modulus: None,
@@ -65,15 +65,15 @@ where
lwe_dimension: Some(params.lwe_dimension()),
glwe_dimension: Some(params.glwe_dimension()),
polynomial_size: Some(params.polynomial_size()),
lwe_modular_std_dev: Some(params.lwe_modular_std_dev()),
glwe_modular_std_dev: Some(params.glwe_modular_std_dev()),
lwe_std_dev: Some(params.lwe_noise_distribution().gaussian_std_dev()),
glwe_std_dev: Some(params.glwe_noise_distribution().gaussian_std_dev()),
pbs_base_log: Some(params.pbs_base_log()),
pbs_level: Some(params.pbs_level()),
ks_base_log: Some(params.ks_base_log()),
ks_level: Some(params.ks_level()),
pfks_level: None,
pfks_base_log: None,
pfks_modular_std_dev: None,
pfks_std_dev: None,
cbs_level: None,
cbs_base_log: None,
message_modulus: Some(params.message_modulus().0),
@@ -97,15 +97,15 @@ impl<Scalar: UnsignedInteger> From<ShortintKeySwitchingParameters>
lwe_dimension: None,
glwe_dimension: None,
polynomial_size: None,
lwe_modular_std_dev: None,
glwe_modular_std_dev: None,
lwe_std_dev: None,
glwe_std_dev: None,
pbs_base_log: None,
pbs_level: None,
ks_base_log: Some(params.ks_base_log),
ks_level: Some(params.ks_level),
pfks_level: None,
pfks_base_log: None,
pfks_modular_std_dev: None,
pfks_std_dev: None,
cbs_level: None,
cbs_base_log: None,
message_modulus: None,
@@ -227,7 +227,8 @@ pub fn write_to_json<
}
const FAST_BENCH_BIT_SIZES: [usize; 1] = [32];
const BENCH_BIT_SIZES: [usize; 7] = [8, 16, 32, 40, 64, 128, 256];
const BENCH_BIT_SIZES: [usize; 8] = [4, 8, 16, 32, 40, 64, 128, 256];
const MULTI_BIT_CPU_SIZES: [usize; 6] = [4, 8, 16, 32, 40, 64];
/// User configuration in which benchmarks must be run.
#[derive(Default)]
@@ -258,16 +259,14 @@ impl EnvConfig {
/// Get precisions values to benchmark.
#[allow(dead_code)]
pub fn bit_sizes(&self) -> Vec<usize> {
-if self.is_multi_bit {
-if self.is_fast_bench {
-FAST_BENCH_BIT_SIZES.to_vec()
-} else if cfg!(feature = "gpu") {
+if self.is_fast_bench {
+FAST_BENCH_BIT_SIZES.to_vec()
+} else if self.is_multi_bit {
+if cfg!(feature = "gpu") {
BENCH_BIT_SIZES.to_vec()
} else {
-vec![8, 16, 32, 40, 64]
+MULTI_BIT_CPU_SIZES.to_vec()
}
-} else if self.is_fast_bench {
-FAST_BENCH_BIT_SIZES.to_vec()
} else {
BENCH_BIT_SIZES.to_vec()
}


@@ -99,8 +99,8 @@ void test_custom_keygen(void) {
.lwe_dimension = 10,
.glwe_dimension = 1,
.polynomial_size = 1024,
.lwe_modular_std_dev = 10e-100,
.glwe_modular_std_dev = 10e-100,
.lwe_noise_distribution = new_gaussian_from_std_dev(10e-100),
.glwe_noise_distribution = new_gaussian_from_std_dev(10e-100),
.pbs_base_log = 3,
.pbs_level = 1,
.ks_base_log = 4,
@@ -113,6 +113,16 @@ void test_custom_keygen(void) {
boolean_destroy_client_key(cks);
boolean_destroy_server_key(sks);
params.lwe_noise_distribution = new_t_uniform(12);
params.glwe_noise_distribution = new_t_uniform(8);
int t_uniform_gen_keys_ok = boolean_gen_keys_with_parameters(params, &cks, &sks);
assert(t_uniform_gen_keys_ok == 0);
boolean_destroy_client_key(cks);
boolean_destroy_server_key(sks);
}
void test_public_keygen(void) {


@@ -101,12 +101,13 @@ void test_server_key_trivial_encrypt(void) {
void test_custom_keygen(void) {
ShortintClientKey *cks = NULL;
ShortintServerKey *sks = NULL;
ShortintPBSParameters params = {
.lwe_dimension = 10,
.glwe_dimension = 1,
.polynomial_size = 1024,
.lwe_modular_std_dev = 10e-100,
.glwe_modular_std_dev = 10e-100,
.lwe_noise_distribution = new_gaussian_from_std_dev(10e-100),
.glwe_noise_distribution = new_gaussian_from_std_dev(10e-100),
.pbs_base_log = 2,
.pbs_level = 3,
.ks_base_log = 2,
@@ -123,6 +124,16 @@ void test_custom_keygen(void) {
shortint_destroy_client_key(cks);
shortint_destroy_server_key(sks);
params.lwe_noise_distribution = new_t_uniform(24);
params.glwe_noise_distribution = new_t_uniform(16);
int t_uniform_gen_keys_ok = shortint_gen_keys_with_parameters(params, &cks, &sks);
assert(t_uniform_gen_keys_ok == 0);
shortint_destroy_client_key(cks);
shortint_destroy_server_key(sks);
}
void test_public_keygen(ShortintPBSParameters params) {


@@ -17,7 +17,8 @@ use tfhe::core_crypto::prelude::*;
// computations
// Define parameters for LweCiphertext creation
let lwe_dimension = LweDimension(742);
let lwe_modular_std_dev = StandardDev(0.000007069849454709433);
let lwe_noise_distribution =
Gaussian::from_dispersion_parameter(StandardDev(0.000007069849454709433), 0.0);
let ciphertext_modulus = CiphertextModulus::new_native();
// Create the PRNG
@@ -43,7 +44,7 @@ encrypt_lwe_ciphertext(
&lwe_secret_key,
&mut lwe,
plaintext,
lwe_modular_std_dev,
lwe_noise_distribution,
&mut encryption_generator,
);
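Taken together, the updated flow is easier to read as one self-contained program. The following is a minimal sketch assuming the 0.6 `core_crypto` API shown in this diff (the seeder and generator setup follows the crate's documented examples; names like `boxed_seeder` are illustrative):

```rust
use tfhe::core_crypto::prelude::*;

pub fn main() {
    // Parameters: LWE dimension plus the new explicit noise distribution
    let lwe_dimension = LweDimension(742);
    let lwe_noise_distribution =
        Gaussian::from_dispersion_parameter(StandardDev(0.000007069849454709433), 0.0);
    let ciphertext_modulus = CiphertextModulus::new_native();

    // Randomness setup
    let mut boxed_seeder = new_seeder();
    let seeder = boxed_seeder.as_mut();
    let mut encryption_generator =
        EncryptionRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed(), seeder);
    let mut secret_generator =
        SecretRandomGenerator::<ActivatedRandomGenerator>::new(seeder.seed());

    // Generate a binary LWE secret key and encrypt a 4-bit message in the top bits
    let lwe_secret_key =
        allocate_and_generate_new_binary_lwe_secret_key(lwe_dimension, &mut secret_generator);
    let msg = 3u64;
    let plaintext = Plaintext(msg << 60);
    let lwe = allocate_and_encrypt_new_lwe_ciphertext(
        &lwe_secret_key,
        plaintext,
        lwe_noise_distribution,
        ciphertext_modulus,
        &mut encryption_generator,
    );

    // Decrypt, then round away the noise to recover the message
    let decrypted = decrypt_lwe_ciphertext(&lwe_secret_key, &lwe);
    let decomposer = SignedDecomposer::new(DecompositionBaseLog(4), DecompositionLevelCount(1));
    let recovered = decomposer.closest_representable(decrypted.0) >> 60;
    assert_eq!(recovered, msg);
}
```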


@@ -9,7 +9,7 @@ Welcome to this tutorial about `TFHE-rs` `core_crypto` module.
To use `TFHE-rs`, it first has to be added as a dependency in the `Cargo.toml`:
```toml
tfhe = { version = "0.5.0", features = [ "x86_64-unix" ] }
tfhe = { version = "0.6.0", features = [ "x86_64-unix" ] }
```
This enables the `x86_64-unix` feature, which provides efficient implementations of various algorithms for `x86_64` CPUs on Unix-like systems. The 'unix' suffix indicates that the `UnixSeeder`, which uses `/dev/random` to generate random numbers, is activated as a fallback when no hardware random number generator is available (e.g., when `rdseed` is missing on `x86_64`, or when the [`Randomization Services`](https://developer.apple.com/documentation/security/1399291-secrandomcopybytes?language=objc) are unavailable on Apple platforms). To avoid having the `UnixSeeder` as a potential fallback, or to run on non-Unix systems (e.g., Windows), the `x86_64` feature is sufficient.
@@ -19,19 +19,19 @@ For Apple Silicon, the `aarch64-unix` or `aarch64` feature should be enabled. `a
In short: For `x86_64`-based machines running Unix-like OSes:
```toml
tfhe = { version = "0.5.0", features = ["x86_64-unix"] }
tfhe = { version = "0.6.0", features = ["x86_64-unix"] }
```
For Apple Silicon or aarch64-based machines running Unix-like OSes:
```toml
tfhe = { version = "0.5.0", features = ["aarch64-unix"] }
tfhe = { version = "0.6.0", features = ["aarch64-unix"] }
```
For `x86_64`-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:
```toml
tfhe = { version = "0.5.0", features = ["x86_64"] }
tfhe = { version = "0.6.0", features = ["x86_64"] }
```
### Commented code to double a 2-bit message, both in a leveled fashion and via a PBS, using the `core_crypto` module.
@@ -48,8 +48,10 @@ pub fn main() {
let small_lwe_dimension = LweDimension(742);
let glwe_dimension = GlweDimension(1);
let polynomial_size = PolynomialSize(2048);
let lwe_modular_std_dev = StandardDev(0.000007069849454709433);
let glwe_modular_std_dev = StandardDev(0.00000000000000029403601535432533);
let lwe_noise_distribution =
Gaussian::from_dispersion_parameter(StandardDev(0.000007069849454709433), 0.0);
let glwe_noise_distribution =
Gaussian::from_dispersion_parameter(StandardDev(0.00000000000000029403601535432533), 0.0);
let pbs_base_log = DecompositionBaseLog(23);
let pbs_level = DecompositionLevelCount(1);
let ciphertext_modulus = CiphertextModulus::new_native();
@@ -88,7 +90,7 @@ pub fn main() {
&glwe_sk,
pbs_base_log,
pbs_level,
glwe_modular_std_dev,
glwe_noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);
@@ -124,7 +126,7 @@ pub fn main() {
let lwe_ciphertext_in: LweCiphertextOwned<u64> = allocate_and_encrypt_new_lwe_ciphertext(
&small_lwe_sk,
plaintext,
lwe_modular_std_dev,
lwe_noise_distribution,
ciphertext_modulus,
&mut encryption_generator,
);


@@ -32,8 +32,12 @@ fn main() {
LweDimension(586),
GlweDimension(2),
PolynomialSize(512),
StandardDev(0.00008976167396834998),
StandardDev(0.00000002989040792967434),
DynamicDistribution::new_gaussian_from_std_dev(
StandardDev(0.00008976167396834998),
),
DynamicDistribution::new_gaussian_from_std_dev(
StandardDev(0.00000002989040792967434),
),
DecompositionBaseLog(8),
DecompositionLevelCount(2),
DecompositionBaseLog(2),


@@ -52,6 +52,7 @@ For instance:
```rust
use tfhe::shortint::prelude::*;
use tfhe::shortint::parameters::DynamicDistribution;
fn main() {
let param = unsafe {
@@ -59,8 +60,12 @@ fn main() {
LweDimension(656),
GlweDimension(2),
PolynomialSize(512),
StandardDev(0.000034119201269311964),
StandardDev(0.00000004053919869756513),
DynamicDistribution::new_gaussian_from_std_dev(
StandardDev(0.000034119201269311964),
),
DynamicDistribution::new_gaussian_from_std_dev(
StandardDev(0.00000004053919869756513),
),
DecompositionBaseLog(8),
DecompositionLevelCount(2),
DecompositionBaseLog(3),
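The two constructors used across these updated snippets are worth seeing side by side. A small sketch, with types written out for standalone compilation (the concrete values are taken from the diff above; `Debug` output is assumed to be available on these parameter types):

```rust
use tfhe::shortint::parameters::DynamicDistribution;
use tfhe::shortint::prelude::*;

fn main() {
    // Gaussian noise described by an explicit standard deviation
    let gaussian: DynamicDistribution<u64> =
        DynamicDistribution::new_gaussian_from_std_dev(StandardDev(0.000034119201269311964));
    // Bounded uniform (TUniform) noise with bound parameter 12
    let t_uniform: DynamicDistribution<u64> = DynamicDistribution::new_t_uniform(12);
    println!("{gaussian:?}");
    println!("{t_uniform:?}");
}
```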


@@ -8,12 +8,12 @@ To use `TFHE-rs` in your project, you first need to add it as a dependency in yo
If you are using an `x86_64` machine running a Unix-like OS:
```toml
tfhe = { version = "0.5.0", features = [ "boolean", "shortint", "integer", "x86_64-unix" ] }
tfhe = { version = "0.6.0", features = [ "boolean", "shortint", "integer", "x86_64-unix" ] }
```
If you are using an `ARM` machine running a Unix-like OS:
```toml
tfhe = { version = "0.5.0", features = [ "boolean", "shortint", "integer", "aarch64-unix" ] }
tfhe = { version = "0.6.0", features = [ "boolean", "shortint", "integer", "aarch64-unix" ] }
```
If you are using an `x86_64` machine with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:
@@ -24,7 +24,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"]
{% hint style="info" %}
You need to use a Rust version >= 1.72 to compile TFHE-rs.
You need to use a Rust version >= 1.73 to compile TFHE-rs.
{% endhint %}
{% hint style="success" %}


@@ -44,7 +44,7 @@ fn main() {
The default configuration for x86 Unix machines:
```toml
tfhe = { version = "0.5.0", features = ["integer", "x86_64-unix"]}
tfhe = { version = "0.6.0", features = ["integer", "x86_64-unix"]}
```
Configuration options for different platforms can be found [here](../getting_started/installation.md). Other Rust and homomorphic-type features are described [here](../how_to/rust_configuration.md).


@@ -1,3 +1,3 @@
# Migrating Data to TFHE-rs 0.5.0 (This Release)
# Migrating Data to TFHE-rs 0.6.0 (This Release)
Forward compatibility code to migrate data from TFHE-rs 0.4 to TFHE-rs 0.5 has been added in a minor release of TFHE-rs 0.4, the documentation about the process can be found [here](https://docs.zama.ai/tfhe-rs/v/0.4-1/how-to/migrate_data).
Forward compatibility code to migrate data from TFHE-rs 0.5 to TFHE-rs 0.6 has been added in a minor release of TFHE-rs 0.5; the documentation about the process can be found [here](https://docs.zama.ai/tfhe-rs/v/0.5-1/how-to/migrate_data).


@@ -13,12 +13,12 @@ To use the `TFHE-rs GPU backend` in your project, you first need to add it as a
If you are using an `x86` machine:
```toml
tfhe = { version = "0.5.0", features = [ "boolean", "shortint", "integer", "x86_64-unix", "gpu" ] }
tfhe = { version = "0.6.0", features = [ "boolean", "shortint", "integer", "x86_64-unix", "gpu" ] }
```
If you are using an `ARM` machine:
```toml
tfhe = { version = "0.5.0", features = [ "boolean", "shortint", "integer", "aarch64-unix", "gpu" ] }
tfhe = { version = "0.6.0", features = [ "boolean", "shortint", "integer", "aarch64-unix", "gpu" ] }
```
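Once the dependency is in place, server-side computation can be moved to the GPU. A minimal sketch, assuming this release's high-level API in which a `CompressedServerKey` is decompressed into GPU memory:

```rust
use tfhe::prelude::*;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint32};

fn main() {
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);

    // Decompress the server key directly onto the GPU
    let compressed_server_key = CompressedServerKey::new(&client_key);
    let gpu_server_key = compressed_server_key.decompress_to_gpu();
    set_server_key(gpu_server_key);

    let a = FheUint32::encrypt(27u32, &client_key);
    let b = FheUint32::encrypt(15u32, &client_key);
    let c = &a + &b; // executed on the GPU

    let dec: u32 = c.decrypt(&client_key);
    assert_eq!(dec, 42);
}
```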


@@ -1,6 +1,6 @@
# Using the right toolchain for TFHE-rs.
TFHE-rs only requires a nightly toolchain for building the C API and using advanced SIMD instructions, otherwise you can use a stable toolchain (with version >= 1.72)
TFHE-rs only requires a nightly toolchain for building the C API and for using advanced SIMD instructions; otherwise, a stable toolchain (version >= 1.73) is enough.
Install the needed Rust toolchain:
```shell


@@ -11,7 +11,7 @@ To serialize our data, a [data format](https://serde.rs/#data-formats) should be
[dependencies]
# ...
tfhe = { version = "0.5.0", features = ["integer","x86_64-unix"]}
tfhe = { version = "0.6.0", features = ["integer","x86_64-unix"]}
bincode = "1.3.3"
```
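With `bincode` added, any type that implements serde's traits (ciphertexts, keys) can be round-tripped through bytes. A minimal sketch using the high-level API (the choice of `FheUint8` is illustrative):

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint8};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let config = ConfigBuilder::default().build();
    let (client_key, _server_key) = generate_keys(config);

    let ct = FheUint8::encrypt(42u8, &client_key);

    // Serialize the ciphertext to a byte buffer, then restore it
    let bytes = bincode::serialize(&ct)?;
    let restored: FheUint8 = bincode::deserialize(&bytes)?;

    let dec: u8 = restored.decrypt(&client_key);
    assert_eq!(dec, 42);
    Ok(())
}
```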


@@ -24,7 +24,7 @@ To use the `FheUint8` type, the `integer` feature must be activated:
[dependencies]
# Default configuration for x86 Unix machines:
tfhe = { version = "0.5.0", features = ["integer", "x86_64-unix"]}
tfhe = { version = "0.6.0", features = ["integer", "x86_64-unix"]}
```
Other configurations can be found [here](../getting_started/installation.md).
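As a quick check that the feature is wired up, a small usage sketch (standard high-level API; the concrete values are arbitrary):

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
    set_server_key(server_key);

    let a = FheUint8::encrypt(27u8, &client_key);
    let b = FheUint8::encrypt(3u8, &client_key);

    // Homomorphic computation happens under the server key
    let c = &a * &b;

    let dec: u8 = c.decrypt(&client_key);
    assert_eq!(dec, 81);
}
```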


@@ -19,7 +19,7 @@ This function returns a Boolean that will be either `true` or `false` so that th
# Cargo.toml
# Default configuration for x86 Unix machines:
tfhe = { version = "0.5.0", features = ["integer", "x86_64-unix"]}
tfhe = { version = "0.6.0", features = ["integer", "x86_64-unix"]}
```
Other configurations can be found [here](../getting_started/installation.md).
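As a generic illustration of a predicate that produces an encrypted boolean (not necessarily the exact function this page documents), an encrypted comparison with the high-level API:

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8};

fn main() {
    let config = ConfigBuilder::default().build();
    let (client_key, server_key) = generate_keys(config);
    set_server_key(server_key);

    let a = FheUint8::encrypt(5u8, &client_key);
    let b = FheUint8::encrypt(7u8, &client_key);

    // lt returns an encrypted boolean (FheBool)
    let cond = a.lt(&b);

    let dec: bool = cond.decrypt(&client_key);
    assert!(dec);
}
```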


@@ -0,0 +1,32 @@
use tfhe::prelude::*;
use tfhe::shortint::parameters::DynamicDistribution;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint32};
fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut my_params = tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;
// DISCLAIMER: these parameters are not guaranteed to be secure; a thorough noise
// and security analysis is required from the end user.
// This only demonstrates that a custom noise distribution can be used if desired.
my_params.lwe_noise_distribution = DynamicDistribution::new_t_uniform(20);
my_params.glwe_noise_distribution = DynamicDistribution::new_t_uniform(10);
let config = ConfigBuilder::default()
.use_custom_parameters(my_params, None)
.build();
let (keys, server_keys) = generate_keys(config);
set_server_key(server_keys);
let clear_a = 673u32;
let clear_b = 6u32;
let a = FheUint32::try_encrypt(clear_a, &keys)?;
let b = FheUint32::try_encrypt(clear_b, &keys)?;
let c = &a >> &b;
let decrypted: u32 = c.decrypt(&keys);
assert_eq!(decrypted, clear_a >> clear_b);
println!("decrypted = {decrypted} = {clear_a} >> {clear_b}");
Ok(())
}


@@ -1,7 +1,9 @@
use tfhe::prelude::*;
use tfhe::*;
pub fn main() {
let config = ConfigBuilder::default().build();
let (cks, sks) = generate_keys(config);
@@ -11,19 +13,163 @@ pub fn main() {
set_server_key(sks);
// Negation
let c = -&a;
let neg_32_count = get_pbs_count();
reset_pbs_count();
// Add / Sub
let c = &a + &b;
let add_32_count = get_pbs_count();
reset_pbs_count();
// Mul
let c = &a * &b;
let mul_32_count = get_pbs_count();
reset_pbs_count();
// Equal / Not Equal
let c = &a.eq(&b);
let eq_32_count = get_pbs_count();
reset_pbs_count();
// Comparisons
let c = &a.gt(&b);
let gt_32_count = get_pbs_count();
reset_pbs_count();
// Max / Min
let c = &a.max(&b);
let max_32_count = get_pbs_count();
reset_pbs_count();
// Bitwise operations
let c = &a & &b;
let and_32_count = get_pbs_count();
reset_pbs_count();
// Div / Rem
let c = &a % &b;
let mod_32_count = get_pbs_count();
reset_pbs_count();
// Left / Right Shifts
let c = &a << &b;
let shift_32_count = get_pbs_count();
reset_pbs_count();
// Left / Right Rotations
let c = &a.rotate_right(&b);
let rotate_32_count = get_pbs_count();
reset_pbs_count();
println!("neg_32_count: {neg_32_count}");
println!("add_32_count: {add_32_count}");
println!("mul_32_count: {mul_32_count}");
println!("eq_32_count: {eq_32_count}");
println!("gt_32_count: {gt_32_count}");
println!("max_32_count: {max_32_count}");
println!("and_32_count: {and_32_count}");
println!("mod_32_count: {mod_32_count}");
println!("shift_32_count: {shift_32_count}");
println!("and_32_count: {rotate_32_count}");
let c_dec: u32 = c.decrypt(&cks);
let d_dec: u32 = d.decrypt(&cks);
assert_eq!(42 * 69, c_dec);
assert_eq!(42 & 69, d_dec);
let config = ConfigBuilder::default().build();
let (cks, sks) = generate_keys(config);
let a = FheUint64::encrypt(42u64, &cks);
let b = FheUint64::encrypt(69u64, &cks);
set_server_key(sks);
// Negation
let c = -&a;
let neg_64_count = get_pbs_count();
reset_pbs_count();
// Add / Sub
let c = &a + &b;
let add_64_count = get_pbs_count();
reset_pbs_count();
// Mul
let c = &a * &b;
let mul_64_count = get_pbs_count();
reset_pbs_count();
// Equal / Not Equal
let c = &a.eq(&b);
let eq_64_count = get_pbs_count();
reset_pbs_count();
// Comparisons
let c = &a.gt(&b);
let gt_64_count = get_pbs_count();
reset_pbs_count();
// Max / Min
let c = &a.max(&b);
let max_64_count = get_pbs_count();
reset_pbs_count();
// Bitwise operations
let c = &a & &b;
let and_64_count = get_pbs_count();
reset_pbs_count();
// Div / Rem
let c = &a % &b;
let mod_64_count = get_pbs_count();
reset_pbs_count();
// Left / Right Shifts
let c = &a << &b;
let shift_64_count = get_pbs_count();
reset_pbs_count();
// Left / Right Rotations
let c = &a.rotate_right(&b);
let rotate_64_count = get_pbs_count();
reset_pbs_count();
println!("neg_64_count: {neg_64_count}");
println!("add_64_count: {add_64_count}");
println!("mul_64_count: {mul_64_count}");
println!("eq_64_count: {eq_64_count}");
println!("gt_64_count: {gt_64_count}");
println!("max_64_count: {max_64_count}");
println!("and_64_count: {and_64_count}");
println!("mod_64_count: {mod_64_count}");
println!("shift_64_count: {shift_64_count}");
println!("and_64_count: {rotate_64_count}");
assert!(false);
}
// pub fn count_all_pbs(){
// let (cks, sks) = generate_keys(config);
// let a = FheUint32::encrypt(42, &cks);
// let b = FheUint32::encrypt(69, &cks);
// set_server_key(sks);
// let c = &a * &b;
// let mul_32_count = get_pbs_count();
// reset_pbs_count();
// let d = &a & &b;
// let and_32_count = get_pbs_count();
// }
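Distilled, the measurement pattern above is: reset the counter, run one operation, read the counter. A minimal sketch, assuming `get_pbs_count`/`reset_pbs_count` are exported from the crate root as in this example (behind a dedicated PBS-statistics feature flag):

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint32};
// Assumed exports; the example file above pulls them in via `use tfhe::*;`
use tfhe::{get_pbs_count, reset_pbs_count};

fn main() {
    let config = ConfigBuilder::default().build();
    let (cks, sks) = generate_keys(config);
    set_server_key(sks);

    let a = FheUint32::encrypt(42u32, &cks);
    let b = FheUint32::encrypt(69u32, &cks);

    reset_pbs_count();
    let product = &a * &b;
    // PBS count for a single 32-bit multiplication
    println!("mul_32_count: {}", get_pbs_count());

    let dec: u32 = product.decrypt(&cks);
    assert_eq!(dec, 42 * 69);
}
```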

Some files were not shown because too many files have changed in this diff.