mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-12 16:18:52 -05:00
Compare commits
157 Commits
cm
...
gitbook-ed
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bddc35459d | ||
|
|
27c421b359 | ||
|
|
2adeff44f3 | ||
|
|
d0042aed54 | ||
|
|
5eabdeab55 | ||
|
|
0152c212af | ||
|
|
9a2c4a3784 | ||
|
|
c14aad5656 | ||
|
|
702e0ef306 | ||
|
|
515d2e009f | ||
|
|
711b5151dc | ||
|
|
ceaee2f910 | ||
|
|
41015db7a1 | ||
|
|
485b2a7693 | ||
|
|
7d903d5f7a | ||
|
|
19ac6eb123 | ||
|
|
5b653864b7 | ||
|
|
a1d189b415 | ||
|
|
c59434f183 | ||
|
|
83239e6afa | ||
|
|
ef8cb0273f | ||
|
|
9b353bac2d | ||
|
|
46d65f1f87 | ||
|
|
a63a2cb725 | ||
|
|
c45af05ec6 | ||
|
|
584eaeb4ed | ||
|
|
8d94ed2512 | ||
|
|
b8d9dbe85b | ||
|
|
ad25340c33 | ||
|
|
ad1ae0c8c2 | ||
|
|
ee40906b8b | ||
|
|
bf6b4cc541 | ||
|
|
24404567a4 | ||
|
|
052dd4a60e | ||
|
|
f8d829d076 | ||
|
|
d9761ca17e | ||
|
|
8d2e15347b | ||
|
|
a368257bc7 | ||
|
|
76d23d0c91 | ||
|
|
ddc5002232 | ||
|
|
c08c479616 | ||
|
|
f26afc16de | ||
|
|
13f533f6fb | ||
|
|
d9541e472b | ||
|
|
3453e45258 | ||
|
|
55de96f046 | ||
|
|
9747c06f6e | ||
|
|
00f72d2c13 | ||
|
|
01f5cb9056 | ||
|
|
d66e313fa4 | ||
|
|
c9d530e642 | ||
|
|
6c2096fe52 | ||
|
|
1e94134dda | ||
|
|
c76a60111c | ||
|
|
18ff400df2 | ||
|
|
3d31d09be5 | ||
|
|
76322606f2 | ||
|
|
bf58a9f0c6 | ||
|
|
64461c82b4 | ||
|
|
339c84fbd9 | ||
|
|
bc682a5ffb | ||
|
|
2920daf2d9 | ||
|
|
e7352eee8b | ||
|
|
33a7e9f3e4 | ||
|
|
96da25ce90 | ||
|
|
548f2e5d05 | ||
|
|
f313b58c8e | ||
|
|
fd038346b7 | ||
|
|
f0fcfd517b | ||
|
|
0d2448e9e9 | ||
|
|
68ef237ae6 | ||
|
|
fb39864f05 | ||
|
|
69a5562aba | ||
|
|
d49ffdd26f | ||
|
|
c87c362d42 | ||
|
|
905ef4ea78 | ||
|
|
957dd47295 | ||
|
|
0a550ac803 | ||
|
|
8c62155429 | ||
|
|
3e23631bdc | ||
|
|
02b2fcf78d | ||
|
|
d9d222c1b5 | ||
|
|
f2011cd30d | ||
|
|
1b3d41ec44 | ||
|
|
65816a175a | ||
|
|
2d1cf95900 | ||
|
|
247072a81a | ||
|
|
c65a58c14f | ||
|
|
3a4859553e | ||
|
|
9c3a159ca1 | ||
|
|
e76ddd5a49 | ||
|
|
fa35b6ef8f | ||
|
|
a6e835b3f1 | ||
|
|
c586d64fab | ||
|
|
6108f180bf | ||
|
|
8f0e4f6c99 | ||
|
|
473b7e0c6a | ||
|
|
e9c92bc9a3 | ||
|
|
e4c0c4c15f | ||
|
|
ea7c579efc | ||
|
|
c9d19fca19 | ||
|
|
f3e6074480 | ||
|
|
17c110f536 | ||
|
|
45e27d8836 | ||
|
|
ee10508c99 | ||
|
|
9213436b93 | ||
|
|
aa2a8e31fe | ||
|
|
d49b8235bf | ||
|
|
cec4a5b60b | ||
|
|
15147b4359 | ||
|
|
f2ee360a47 | ||
|
|
6631aae069 | ||
|
|
fa9cd866e4 | ||
|
|
c632ac1b9a | ||
|
|
f0e6b4c395 | ||
|
|
2cd51ed36d | ||
|
|
dc04a5138e | ||
|
|
eda338aa29 | ||
|
|
df6fa86481 | ||
|
|
6742e150b0 | ||
|
|
93fac32755 | ||
|
|
9ac57e75c9 | ||
|
|
a7abee0491 | ||
|
|
0228a58cfc | ||
|
|
f98c680e95 | ||
|
|
dcc3d267e4 | ||
|
|
2067092e0a | ||
|
|
7c50216f7a | ||
|
|
77c0532793 | ||
|
|
00ddfdec8b | ||
|
|
ef9ec13999 | ||
|
|
b53c8aac3f | ||
|
|
f052c1f8ba | ||
|
|
fef6d18605 | ||
|
|
e5505ab686 | ||
|
|
ab2c5f09a8 | ||
|
|
40ae841a15 | ||
|
|
0a317c5f0e | ||
|
|
415a8a2de5 | ||
|
|
935da25360 | ||
|
|
dbeff4e4b4 | ||
|
|
1e50d0cdd2 | ||
|
|
7c5551bf45 | ||
|
|
95c36d54cb | ||
|
|
384e15ca5a | ||
|
|
526a53f3d4 | ||
|
|
7d17b71740 | ||
|
|
8ecb85e4dd | ||
|
|
cf30db7a30 | ||
|
|
2599f7d5ea | ||
|
|
ae88bb3264 | ||
|
|
6fb898db66 | ||
|
|
e750d2cd92 | ||
|
|
d8586080da | ||
|
|
4cc2e85556 | ||
|
|
303a65c88d | ||
|
|
18c01e74d6 |
4
.github/workflows/aws_tfhe_fast_tests.yml
vendored
4
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -5,6 +5,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -61,10 +62,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Run concrete-csprng tests
|
||||
run: |
|
||||
|
||||
112
.github/workflows/aws_tfhe_gpu_tests.yml
vendored
Normal file
112
.github/workflows/aws_tfhe_gpu_tests.yml
vendored
Normal file
@@ -0,0 +1,112 @@
|
||||
# Compile and test Concrete-cuda on an AWS instance
|
||||
name: Concrete Cuda - Full tests
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
# All the inputs are provided by Slab
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "AWS instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "AWS instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "AWS instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: 'Slab request ID'
|
||||
type: string
|
||||
fork_repo:
|
||||
description: 'Name of forked repo as user/repo'
|
||||
type: string
|
||||
fork_git_sha:
|
||||
description: 'Git SHA to checkout from fork'
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
run-cuda-tests-linux:
|
||||
concurrency:
|
||||
group: tfhe_cuda_backend_test-${{ github.ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
name: Test code in EC2
|
||||
runs-on: ${{ inputs.runner_name }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
|
||||
steps:
|
||||
# Step used for log purpose.
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "ID: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
echo "Fork repo: ${{ inputs.fork_repo }}"
|
||||
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: ${{ inputs.fork_repo }}
|
||||
ref: ${{ inputs.fork_git_sha }}
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run clippy checks
|
||||
run: |
|
||||
make clippy_gpu
|
||||
|
||||
- name: Run all tests
|
||||
run: |
|
||||
make test_gpu
|
||||
|
||||
- name: Run user docs tests
|
||||
run: |
|
||||
make test_user_doc_gpu
|
||||
|
||||
- name: Test C API
|
||||
run: |
|
||||
make test_c_api_gpu
|
||||
4
.github/workflows/aws_tfhe_integer_tests.yml
vendored
4
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -4,6 +4,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -60,10 +61,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Gen Keys if required
|
||||
run: |
|
||||
|
||||
@@ -4,6 +4,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -60,10 +61,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Gen Keys if required
|
||||
run: |
|
||||
|
||||
4
.github/workflows/aws_tfhe_tests.yml
vendored
4
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -4,6 +4,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -60,10 +61,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Run concrete-csprng tests
|
||||
run: |
|
||||
|
||||
4
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
4
.github/workflows/aws_tfhe_wasm_tests.yml
vendored
@@ -4,6 +4,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -60,10 +61,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Run js on wasm API tests
|
||||
run: |
|
||||
|
||||
6
.github/workflows/boolean_benchmark.yml
vendored
6
.github/workflows/boolean_benchmark.yml
vendored
@@ -32,6 +32,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-boolean-benchmarks:
|
||||
@@ -61,10 +62,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
@@ -96,7 +96,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_boolean
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
3
.github/workflows/cargo_build.yml
vendored
3
.github/workflows/cargo_build.yml
vendored
@@ -6,6 +6,7 @@ on:
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref }}
|
||||
@@ -17,7 +18,7 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
os: [ubuntu-latest, macos-latest-large, windows-latest]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
|
||||
8
.github/workflows/code_coverage.yml
vendored
8
.github/workflows/code_coverage.yml
vendored
@@ -4,6 +4,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -61,14 +62,13 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@1c938490c880156b746568a518594309cfb3f66b
|
||||
uses: tj-actions/changed-files@90a06d6ba9543371ab4df8eeca0be07ca6054959
|
||||
with:
|
||||
files_yaml: |
|
||||
tfhe:
|
||||
@@ -98,7 +98,7 @@ jobs:
|
||||
make test_shortint_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
|
||||
uses: codecov/codecov-action@4fe8c5f003fae66aa5ebb77cfd3e7bfbbda0b6b0
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
@@ -4,6 +4,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
@@ -52,10 +53,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Dieharder randomness test suite
|
||||
run: |
|
||||
|
||||
8
.github/workflows/integer_benchmark.yml
vendored
8
.github/workflows/integer_benchmark.yml
vendored
@@ -25,6 +25,7 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -54,10 +55,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
@@ -69,7 +69,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
14
.github/workflows/integer_full_benchmark.yml
vendored
14
.github/workflows/integer_full_benchmark.yml
vendored
@@ -28,6 +28,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
prepare-matrix:
|
||||
@@ -39,15 +40,12 @@ jobs:
|
||||
- name: Weekly benchmarks
|
||||
if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"default_comp\", \"default_scalar\", \"default_scalar_comp\"]" >> ${GITHUB_ENV}
|
||||
echo "OP_FLAVOR=[\"default\"]" >> ${GITHUB_ENV}
|
||||
|
||||
- name: Quarterly benchmarks
|
||||
if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
|
||||
run: |
|
||||
echo "OP_FLAVOR=[\"default\", \"default_comp\", \"default_scalar\", \"default_scalar_comp\", \
|
||||
\"smart\", \"smart_comp\", \"smart_scalar\", \"smart_parallelized\", \"smart_parallelized_comp\", \"smart_scalar_parallelized\", \"smart_scalar_parallelized_comp\", \
|
||||
\"unchecked\", \"unchecked_comp\", \"unchecked_scalar\", \"unchecked_scalar_comp\", \
|
||||
\"misc\"]" >> ${GITHUB_ENV}
|
||||
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> ${GITHUB_ENV}
|
||||
|
||||
- name: Set operation flavor output
|
||||
id: set_op_flavor
|
||||
@@ -60,6 +58,7 @@ jobs:
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
continue-on-error: true
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
@@ -90,10 +89,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
@@ -120,7 +118,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
157
.github/workflows/integer_gpu_benchmark.yml
vendored
Normal file
157
.github/workflows/integer_gpu_benchmark.yml
vendored
Normal file
@@ -0,0 +1,157 @@
|
||||
# Run integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer GPU benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
name: Execute integer benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
162
.github/workflows/integer_gpu_full_benchmark.yml
vendored
Normal file
162
.github/workflows/integer_gpu_full_benchmark.yml
vendored
Normal file
@@ -0,0 +1,162 @@
|
||||
# Run all integer benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: Integer GPU full benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
# This input is not used in this workflow but still mandatory since a calling workflow could
|
||||
# use it. If a triggering command include a user_inputs field, then the triggered workflow
|
||||
# must include this very input, otherwise the workflow won't be called.
|
||||
# See start_full_benchmarks.yml as example.
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
integer-benchmarks:
|
||||
name: Execute integer benchmarks for all operations flavor
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
continue-on-error: true
|
||||
strategy:
|
||||
fail-fast: false
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [ integer, integer_multi_bit]
|
||||
op_flavor: [ default, unchecked ]
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
|
||||
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--backend gpu \
|
||||
--project-version "${{ env.COMMIT_HASH }}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${{ env.COMMIT_DATE }}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
slack-notification:
|
||||
name: Slack Notification
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ failure() }}
|
||||
needs: integer-benchmarks
|
||||
steps:
|
||||
- name: Notify
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer GPU full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
@@ -25,6 +25,7 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -54,10 +55,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
@@ -69,7 +69,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
158
.github/workflows/integer_multi_bit_gpu_benchmark.yml
vendored
Normal file
158
.github/workflows/integer_multi_bit_gpu_benchmark.yml
vendored
Normal file
@@ -0,0 +1,158 @@
|
||||
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
|
||||
name: Integer Multi-bit benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
name: Execute integer multi-bit benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "11.8"
|
||||
cuda_arch: "70"
|
||||
gcc: 9
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
|
||||
|
||||
- name: Parse benchmarks to csv
|
||||
run: |
|
||||
make PARSE_INTEGER_BENCH_CSV_FILE=${{ env.PARSE_INTEGER_BENCH_CSV_FILE }} \
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--walk-subdirs \
|
||||
--name-suffix avx512 \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on results file"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "Integer GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
4
.github/workflows/m1_tests.yml
vendored
4
.github/workflows/m1_tests.yml
vendored
@@ -14,6 +14,7 @@ on:
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
FAST_TESTS: "TRUE"
|
||||
|
||||
@@ -30,10 +31,9 @@ jobs:
|
||||
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
|
||||
- name: Install latest stable
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: stable
|
||||
default: true
|
||||
|
||||
- name: Run pcc checks
|
||||
run: |
|
||||
|
||||
6
.github/workflows/pbs_benchmark.yml
vendored
6
.github/workflows/pbs_benchmark.yml
vendored
@@ -32,6 +32,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-pbs-benchmarks:
|
||||
@@ -61,10 +62,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
@@ -86,7 +86,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_pbs
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
142
.github/workflows/pbs_gpu_benchmark.yml
vendored
Normal file
142
.github/workflows/pbs_gpu_benchmark.yml
vendored
Normal file
@@ -0,0 +1,142 @@
|
||||
# Run PBS benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
|
||||
name: PBS GPU benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
instance_id:
|
||||
description: "Instance ID"
|
||||
type: string
|
||||
instance_image_id:
|
||||
description: "Instance AMI ID"
|
||||
type: string
|
||||
instance_type:
|
||||
description: "Instance product type"
|
||||
type: string
|
||||
runner_name:
|
||||
description: "Action runner name"
|
||||
type: string
|
||||
request_id:
|
||||
description: "Slab request ID"
|
||||
type: string
|
||||
# This input is not used in this workflow but still mandatory since a calling workflow could
|
||||
# use it. If a triggering command include a user_inputs field, then the triggered workflow
|
||||
# must include this very input, otherwise the workflow won't be called.
|
||||
# See start_full_benchmarks.yml as example.
|
||||
user_inputs:
|
||||
description: "Type of benchmarks to run"
|
||||
type: string
|
||||
default: "weekly_benchmarks"
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
|
||||
jobs:
|
||||
run-pbs-benchmarks:
|
||||
name: Execute PBS benchmarks in EC2
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
echo "IDs: ${{ inputs.instance_id }}"
|
||||
echo "AMI: ${{ inputs.instance_image_id }}"
|
||||
echo "Type: ${{ inputs.instance_type }}"
|
||||
echo "Request ID: ${{ inputs.request_id }}"
|
||||
|
||||
- name: Get benchmark date
|
||||
run: |
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up home
|
||||
# "Install rust" step require root user to have a HOME directory which is not set.
|
||||
run: |
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Export CUDA variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
|
||||
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
|
||||
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
|
||||
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
|
||||
|
||||
# Specify the correct host compilers
|
||||
- name: Export gcc and g++ variables
|
||||
if: ${{ !cancelled() }}
|
||||
run: |
|
||||
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
make AVX512_SUPPORT=ON bench_pbs_gpu
|
||||
|
||||
- name: Parse results
|
||||
run: |
|
||||
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
|
||||
COMMIT_HASH="$(git describe --tags --dirty)"
|
||||
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
|
||||
--database tfhe_rs \
|
||||
--hardware ${{ inputs.instance_type }} \
|
||||
--backend gpu \
|
||||
--project-version "${COMMIT_HASH}" \
|
||||
--branch ${{ github.ref_name }} \
|
||||
--commit-date "${COMMIT_DATE}" \
|
||||
--bench-date "${{ env.BENCH_DATE }}" \
|
||||
--name-suffix avx512 \
|
||||
--walk-subdirs \
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_pbs
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
|
||||
|
||||
- name: Send data to Slab
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Computing HMac on downloaded artifact"
|
||||
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
|
||||
echo "Sending results to Slab..."
|
||||
curl -v -k \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Slab-Repository: ${{ github.repository }}" \
|
||||
-H "X-Slab-Command: store_data_v2" \
|
||||
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
|
||||
-d @${{ env.RESULTS_FILENAME }} \
|
||||
${{ secrets.SLAB_URL }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "PBS GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
6
.github/workflows/shortint_benchmark.yml
vendored
6
.github/workflows/shortint_benchmark.yml
vendored
@@ -24,6 +24,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-shortint-benchmarks:
|
||||
@@ -53,10 +54,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
@@ -88,7 +88,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -32,6 +32,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
shortint-benchmarks:
|
||||
@@ -67,10 +68,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
@@ -112,7 +112,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -25,6 +25,7 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -54,10 +55,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks with AVX512
|
||||
run: |
|
||||
@@ -69,7 +69,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -28,6 +28,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
integer-benchmarks:
|
||||
@@ -35,12 +36,12 @@ jobs:
|
||||
runs-on: ${{ github.event.inputs.runner_name }}
|
||||
if: ${{ !cancelled() }}
|
||||
continue-on-error: true
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
strategy:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
command: [ integer, integer_multi_bit ]
|
||||
op_flavor: [ default, default_comp, default_scalar, default_scalar_comp,
|
||||
unchecked, unchecked_comp, unchecked_scalar, unchecked_scalar_comp ]
|
||||
op_flavor: [ default, unchecked ]
|
||||
steps:
|
||||
- name: Instance configuration used
|
||||
run: |
|
||||
@@ -66,10 +67,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
|
||||
@@ -96,7 +96,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -25,6 +25,7 @@ env:
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-integer-benchmarks:
|
||||
@@ -54,10 +55,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run multi-bit benchmarks with AVX512
|
||||
run: |
|
||||
@@ -69,7 +69,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
9
.github/workflows/start_benchmarks.yml
vendored
9
.github/workflows/start_benchmarks.yml
vendored
@@ -36,6 +36,10 @@ on:
|
||||
description: "Run PBS benches"
|
||||
type: boolean
|
||||
default: true
|
||||
pbs_gpu_bench:
|
||||
description: "Run PBS benches on GPU"
|
||||
type: boolean
|
||||
default: true
|
||||
wasm_client_bench:
|
||||
description: "Run WASM client benches"
|
||||
type: boolean
|
||||
@@ -49,7 +53,8 @@ jobs:
|
||||
command: [ boolean_bench, shortint_bench,
|
||||
integer_bench, integer_multi_bit_bench,
|
||||
signed_integer_bench, signed_integer_multi_bit_bench,
|
||||
pbs_bench, wasm_client_bench ]
|
||||
integer_gpu_bench, integer_multi_bit_gpu_bench,
|
||||
pbs_bench, pbs_gpu_bench, wasm_client_bench ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
@@ -59,7 +64,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@1c938490c880156b746568a518594309cfb3f66b
|
||||
uses: tj-actions/changed-files@90a06d6ba9543371ab4df8eeca0be07ca6054959
|
||||
with:
|
||||
files_yaml: |
|
||||
common_benches:
|
||||
|
||||
5
.github/workflows/start_full_benchmarks.yml
vendored
5
.github/workflows/start_full_benchmarks.yml
vendored
@@ -24,8 +24,9 @@ jobs:
|
||||
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
|
||||
strategy:
|
||||
matrix:
|
||||
command: [ boolean_bench, shortint_full_bench, integer_full_bench,
|
||||
signed_integer_full_bench, pbs_bench, wasm_client_bench ]
|
||||
command: [ boolean_bench, shortint_full_bench,
|
||||
integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
|
||||
pbs_bench, pbs_gpu_bench, wasm_client_bench ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
|
||||
2
.github/workflows/sync_on_push.yml
vendored
2
.github/workflows/sync_on_push.yml
vendored
@@ -17,7 +17,7 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Save repo
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: repo-archive
|
||||
path: '.'
|
||||
|
||||
@@ -29,6 +29,7 @@ jobs:
|
||||
allow-repeats: true
|
||||
message: |
|
||||
@slab-ci cpu_fast_test
|
||||
@slab-ci gpu_test
|
||||
|
||||
- name: Add approved label
|
||||
uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
|
||||
|
||||
6
.github/workflows/wasm_client_benchmark.yml
vendored
6
.github/workflows/wasm_client_benchmark.yml
vendored
@@ -32,6 +32,7 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
|
||||
jobs:
|
||||
run-wasm-client-benchmarks:
|
||||
@@ -61,10 +62,9 @@ jobs:
|
||||
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Install rust
|
||||
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
|
||||
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks
|
||||
run: |
|
||||
@@ -97,7 +97,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32
|
||||
uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8
|
||||
with:
|
||||
name: ${{ github.sha }}_wasm
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
2 LICENSE
@@ -1,6 +1,6 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2023 ZAMA.
|
||||
Copyright © 2024 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
|
||||
157 Makefile
@@ -17,6 +17,7 @@ FAST_TESTS?=FALSE
|
||||
FAST_BENCH?=FALSE
|
||||
BENCH_OP_FLAVOR?=DEFAULT
|
||||
NODE_VERSION=20
|
||||
FORWARD_COMPAT?=OFF
|
||||
# sed: -n, do not print input stream, -e means a script/expression
|
||||
# 1,/version/ indicates from the first line, to the line matching version at the start of the line
|
||||
# p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
|
||||
@@ -49,10 +50,20 @@ else
|
||||
COVERAGE_ONLY=
|
||||
endif
|
||||
|
||||
ifeq ($(FORWARD_COMPAT),ON)
|
||||
FORWARD_COMPAT_FEATURE=forward_compatibility
|
||||
else
|
||||
FORWARD_COMPAT_FEATURE=
|
||||
endif
|
||||
|
||||
# Variables used only for regex_engine example
|
||||
REGEX_STRING?=''
|
||||
REGEX_PATTERN?=''
|
||||
|
||||
# tfhe-cuda-backend
|
||||
TFHECUDA_SRC="backends/tfhe-cuda-backend/cuda"
|
||||
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
|
||||
|
||||
# Exclude these files from coverage reports
|
||||
define COVERAGE_EXCLUDED_FILES
|
||||
--exclude-files apps/trivium/src/trivium/* \
|
||||
@@ -137,10 +148,21 @@ check_linelint_installed:
|
||||
fmt: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
|
||||
|
||||
.PHONY: fmt_gpu # Format rust and cuda code
|
||||
fmt_gpu: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
|
||||
cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh
|
||||
|
||||
.PHONY: check_fmt # Check rust code format
|
||||
check_fmt: install_rs_check_toolchain
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
|
||||
|
||||
.PHONY: clippy_gpu # Run clippy lints on the gpu backend
|
||||
clippy_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
|
||||
fix_newline: check_linelint_installed
|
||||
linelint -a .
|
||||
@@ -157,6 +179,12 @@ clippy_core: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),nightly-avx512 \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_boolean # Run clippy lints enabling the boolean features
|
||||
clippy_boolean: install_rs_check_toolchain
|
||||
@@ -207,7 +235,7 @@ clippy_trivium: install_rs_check_toolchain
|
||||
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
|
||||
clippy_all_targets:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,safe-deserialization \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
|
||||
@@ -262,17 +290,32 @@ build_tfhe_full: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
|
||||
|
||||
.PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
|
||||
symlink_c_libs_without_fingerprint:
|
||||
@./scripts/symlink_c_libs_without_fingerprint.sh \
|
||||
--cargo-profile "$(CARGO_PROFILE)" \
|
||||
--lib-name tfhe-c-api-dynamic-buffer
|
||||
|
||||
.PHONY: build_c_api # Build the C API for boolean, shortint and integer
|
||||
build_c_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,safe-deserialization \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,$(FORWARD_COMPAT_FEATURE) \
|
||||
-p $(TFHE_SPEC)
|
||||
@"$(MAKE)" symlink_c_libs_without_fingerprint
|
||||
|
||||
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
|
||||
build_c_api_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,gpu \
|
||||
-p $(TFHE_SPEC)
|
||||
@"$(MAKE)" symlink_c_libs_without_fingerprint
|
||||
|
||||
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
|
||||
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,safe-deserialization,experimental-force_fft_algo_dif4 \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
|
||||
-p $(TFHE_SPEC)
|
||||
@"$(MAKE)" symlink_c_libs_without_fingerprint
|
||||
|
||||
.PHONY: build_web_js_api # Build the js API targeting the web browser
|
||||
build_web_js_api: install_rs_build_toolchain install_wasm_pack
|
||||
@@ -326,6 +369,23 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
|
||||
-p $(TFHE_SPEC) -- core_crypto::; \
|
||||
fi
|
||||
|
||||
.PHONY: test_gpu # Run the tests of the core_crypto and integer modules, including experimental, on the gpu backend
|
||||
test_gpu: test_core_crypto_gpu test_integer_gpu
|
||||
|
||||
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
|
||||
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
|
||||
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
|
||||
.PHONY: test_boolean # Run the tests of the boolean module
|
||||
test_boolean: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
@@ -342,7 +402,7 @@ test_boolean_cov: install_rs_check_toolchain install_tarpaulin
|
||||
.PHONY: test_c_api_rs # Run the rust tests for the C API
|
||||
test_c_api_rs: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,safe-deserialization \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
|
||||
-p $(TFHE_SPEC) \
|
||||
c_api
|
||||
|
||||
@@ -353,19 +413,23 @@ test_c_api_c: build_c_api
|
||||
.PHONY: test_c_api # Run all the tests for the C API
|
||||
test_c_api: test_c_api_rs test_c_api_c
|
||||
|
||||
.PHONY: test_c_api_gpu # Run the C tests for the C API
|
||||
test_c_api_gpu: build_c_api_gpu
|
||||
./scripts/c_api_tests.sh --gpu
|
||||
|
||||
.PHONY: test_shortint_ci # Run the tests for shortint ci
|
||||
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)"
|
||||
--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_shortint_multi_bit_ci # Run the tests for shortint ci running only multibit tests
|
||||
test_shortint_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_shortint # Run all the tests for shortint
|
||||
test_shortint: install_rs_build_toolchain
|
||||
@@ -385,7 +449,8 @@ test_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)"
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_unsigned_integer_ci # Run the tests for unsigned integer ci
|
||||
test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -393,7 +458,7 @@ test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--unsigned-only
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_signed_integer_ci # Run the tests for signed integer ci
|
||||
test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -401,14 +466,15 @@ test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--signed-only
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_integer_multi_bit_ci # Run the tests for integer ci running only multibit tests
|
||||
test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)"
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_unsigned_integer_multi_bit_ci # Run the tests for unsigned integer ci running only multibit tests
|
||||
test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -416,7 +482,7 @@ test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nex
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--unsigned-only
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_signed_integer_multi_bit_ci # Run the tests for signed integer ci running only multibit tests
|
||||
test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -424,12 +490,12 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--signed-only
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
|
||||
.PHONY: test_safe_deserialization # Run the tests for safe deserialization
|
||||
test_safe_deserialization: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,safe-deserialization -p $(TFHE_SPEC) -- safe_deserialization::
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_deserialization::
|
||||
|
||||
.PHONY: test_integer # Run all the tests for integer
|
||||
test_integer: install_rs_build_toolchain
|
||||
@@ -448,6 +514,12 @@ test_user_doc: install_rs_build_toolchain
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
|
||||
-- test_user_docs::
|
||||
|
||||
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
|
||||
test_user_doc_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu -p $(TFHE_SPEC) \
|
||||
-- test_user_docs::
|
||||
|
||||
.PHONY: test_regex_engine # Run tests for regex_engine example
|
||||
test_regex_engine: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
@@ -482,7 +554,7 @@ test_concrete_csprng:
|
||||
doc: install_rs_check_toolchain
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: docs # Build rust doc alias for doc
|
||||
docs: doc
|
||||
@@ -491,7 +563,7 @@ docs: doc
|
||||
lint_doc: install_rs_check_toolchain
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer --no-deps
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --no-deps
|
||||
|
||||
.PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
|
||||
lint_docs: lint_doc
|
||||
@@ -508,7 +580,7 @@ format_doc_latex:
|
||||
.PHONY: check_compile_tests # Build tests in debug without running them
|
||||
check_compile_tests:
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,safe-deserialization \
|
||||
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
|
||||
-p $(TFHE_SPEC)
|
||||
|
||||
@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
|
||||
@@ -570,6 +642,20 @@ bench_integer: install_rs_check_toolchain
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_signed_integer # Run benchmarks for signed integer
|
||||
bench_signed_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
|
||||
bench_integer_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
|
||||
bench_integer_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
@@ -578,13 +664,6 @@ bench_integer_multi_bit: install_rs_check_toolchain
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_signed_integer # Run benchmarks for signed integer
|
||||
bench_signed_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-signed-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
|
||||
bench_signed_integer_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
@@ -593,6 +672,14 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
|
||||
--bench integer-signed-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
|
||||
bench_integer_multi_bit_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
|
||||
|
||||
.PHONY: bench_shortint # Run benchmarks for shortint
|
||||
bench_shortint: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
|
||||
@@ -600,6 +687,19 @@ bench_shortint: install_rs_check_toolchain
|
||||
--bench shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_oprf # Run OPRF benchmarks for shortint and integer
|
||||
bench_oprf: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench oprf-shortint-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench oprf-integer-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
|
||||
|
||||
|
||||
.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
|
||||
bench_shortint_multi_bit: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
|
||||
@@ -621,6 +721,12 @@ bench_pbs: install_rs_check_toolchain
|
||||
--bench pbs-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
|
||||
bench_pbs_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench pbs-bench \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
|
||||
|
||||
.PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
|
||||
bench_web_js_api_parallel: build_web_js_api_parallel
|
||||
$(MAKE) -C tfhe/web_wasm_parallel_tests bench
|
||||
@@ -708,9 +814,12 @@ sha256_bool: install_rs_check_toolchain
|
||||
--example sha256_bool \
|
||||
--features=$(TARGET_ARCH_FEATURE),boolean
|
||||
|
||||
.PHONY: pcc # pcc stands for pre commit checks
|
||||
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
|
||||
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests
|
||||
|
||||
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
|
||||
pcc_gpu: pcc clippy_gpu
|
||||
|
||||
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
|
||||
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests
|
||||
|
||||
|
||||
14 README.md
@@ -4,13 +4,17 @@
|
||||
</p>
|
||||
<hr/>
|
||||
<p align="center">
|
||||
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a>
|
||||
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<!-- Version badge using shields.io -->
|
||||
<a href="https://github.com/zama-ai/tfhe-rs/releases">
|
||||
<img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square">
|
||||
</a>
|
||||
<!-- Link to tutorials badge using shields.io -->
|
||||
<a href="#license">
|
||||
<img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-orange?style=flat-square">
|
||||
</a>
|
||||
<!-- Zama Bounty Program -->
|
||||
<a href="https://github.com/zama-ai/bounty-program">
|
||||
<img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-yellow?style=flat-square">
|
||||
@@ -57,7 +61,7 @@ running Windows:
|
||||
tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"] }
|
||||
```
|
||||
|
||||
Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs
|
||||
Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
|
||||
|
||||
|
||||
## A simple example
|
||||
@@ -118,7 +122,7 @@ To run this code, use the following command:
|
||||
<p align="center"> <code> cargo run --release </code> </p>
|
||||
|
||||
Note that when running code that uses `tfhe-rs`, it is highly recommended
|
||||
to run in release mode with cargo's `--release` flag to have the best performances possible,
|
||||
to run in release mode with cargo's `--release` flag to have the best performances possible.
|
||||
|
||||
|
||||
## Contributing
|
||||
@@ -138,9 +142,11 @@ libraries.
|
||||
|
||||
## Need support?
|
||||
<a target="_blank" href="https://community.zama.ai">
|
||||
<img src="https://user-images.githubusercontent.com/5758427/231115030-21195b55-2629-4c01-9809-be5059243999.png">
|
||||
<img src="https://github.com/zama-ai/tfhe-rs/assets/157474013/33d856dc-f25d-454b-a010-af12bff2aa7d">
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
## Citing TFHE-rs
|
||||
|
||||
To cite TFHE-rs in academic papers, please use the following entry:
|
||||
|
||||
18 backends/tfhe-cuda-backend/Cargo.toml (Normal file)
@@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "tfhe-cuda-backend"
|
||||
version = "0.1.3"
|
||||
edition = "2021"
|
||||
authors = ["Zama team"]
|
||||
license = "BSD-3-Clause-Clear"
|
||||
description = "Cuda implementation of TFHE-rs primitives."
|
||||
homepage = "https://www.zama.ai/"
|
||||
documentation = "https://docs.zama.ai/tfhe-rs"
|
||||
repository = "https://github.com/zama-ai/tfhe-rs"
|
||||
readme = "README.md"
|
||||
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
|
||||
|
||||
[build-dependencies]
|
||||
cmake = { version = "0.1" }
|
||||
|
||||
[dependencies]
|
||||
thiserror = "1.0"
|
||||
28 backends/tfhe-cuda-backend/LICENSE (Normal file)
@@ -0,0 +1,28 @@
|
||||
BSD 3-Clause Clear License
|
||||
|
||||
Copyright © 2024 ZAMA.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice, this
|
||||
list of conditions and the following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
3. Neither the name of ZAMA nor the names of its contributors may be used to endorse
|
||||
or promote products derived from this software without specific prior written permission.
|
||||
|
||||
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
|
||||
THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
|
||||
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
53 backends/tfhe-cuda-backend/README.md (Normal file)
@@ -0,0 +1,53 @@
|
||||
# TFHE Cuda backend
|
||||
|
||||
## Introduction
|
||||
|
||||
The `tfhe-cuda-backend` holds the code for GPU acceleration of Zama's variant of TFHE.
|
||||
It implements CUDA/C++ functions to perform homomorphic operations on LWE ciphertexts.
|
||||
|
||||
It provides functions to allocate memory on the GPU, to copy data back
|
||||
and forth between the CPU and the GPU, to create and destroy Cuda streams, etc.:
|
||||
- `cuda_create_stream`, `cuda_destroy_stream`
|
||||
- `cuda_malloc`, `cuda_check_valid_malloc`
|
||||
- `cuda_memcpy_async_to_cpu`, `cuda_memcpy_async_to_gpu`
|
||||
- `cuda_get_number_of_gpus`
|
||||
- `cuda_synchronize_device`
|
||||
The cryptographic operations it provides are:
|
||||
- an amortized implementation of the TFHE programmable bootstrap: `cuda_bootstrap_amortized_lwe_ciphertext_vector_32` and `cuda_bootstrap_amortized_lwe_ciphertext_vector_64`
|
||||
- a low latency implementation of the TFHE programmable bootstrap: `cuda_bootstrap_low_latency_lwe_ciphertext_vector_32` and `cuda_bootstrap_low_latency_lwe_ciphertext_vector_64`
|
||||
- the keyswitch: `cuda_keyswitch_lwe_ciphertext_vector_32` and `cuda_keyswitch_lwe_ciphertext_vector_64`
|
||||
- the larger precision programmable bootstrap (wop PBS, which supports up to 16 bits of message while the classical PBS only supports up to 8 bits of message) and its sub-components: `cuda_wop_pbs_64`, `cuda_extract_bits_64`, `cuda_circuit_bootstrap_64`, `cuda_cmux_tree_64`, `cuda_blind_rotation_sample_extraction_64`
|
||||
- acceleration for leveled operations: `cuda_negate_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_plaintext_vector_64`, `cuda_mult_lwe_ciphertext_vector_cleartext_vector`.
|
||||
|
||||
## Dependencies
|
||||
|
||||
**Disclaimer**: Compilation on Windows/Mac is not supported yet. Only Nvidia GPUs are supported.
|
||||
|
||||
- nvidia driver - for example, if you're running Ubuntu 20.04 check this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux) for installation
|
||||
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) >= 10.0
|
||||
- [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
|
||||
- [cmake](https://cmake.org/) >= 3.24
|
||||
|
||||
## Build
|
||||
|
||||
The Cuda project held in `tfhe-cuda-backend` can be compiled independently of TFHE-rs in the
|
||||
following way:
|
||||
```
|
||||
git clone git@github.com:zama-ai/tfhe-rs
|
||||
cd backends/tfhe-cuda-backend/cuda
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
make
|
||||
```
|
||||
The compute capability is detected automatically (from the properties of the first GPU found) and set accordingly.
If your machine does not have an available Nvidia GPU, compilation will still work as long as the nvcc compiler is installed; the generated executable will then target compute capability 7.0 (sm_70).
|
||||
|
||||
## Links
|
||||
|
||||
- [TFHE](https://eprint.iacr.org/2018/421.pdf)
|
||||
|
||||
## License
|
||||
|
||||
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
|
||||
please contact us at `hello@zama.ai`.
|
||||
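For orientation, below is a minimal, hypothetical usage sketch of the memory and stream helpers that this README lists and that `device.h` (shown later in this diff) declares. The function names and signatures come from that header; the GPU index, buffer size, and the `main` wrapper are illustrative assumptions, and the snippet itself is not part of the diff.

```cpp
// Minimal sketch: round-trip a buffer through the GPU using the helpers
// declared in backends/tfhe-cuda-backend/cuda/include/device.h.
// Sizes and the GPU index are illustrative only.
#include "device.h"

#include <cstdint>
#include <vector>

int main() {
  const uint32_t gpu_index = 0;
  if (cuda_get_number_of_gpus() < 1)
    return 1; // no usable Nvidia GPU on this machine

  cuda_stream_t *stream = cuda_create_stream(gpu_index);

  // One LWE ciphertext of dimension 1024, stored as (lwe_dimension + 1) u64 words.
  const uint64_t size = (1024 + 1) * sizeof(uint64_t);
  std::vector<uint64_t> host_in(1024 + 1, 0), host_out(1024 + 1, 0);

  cuda_check_valid_malloc(size, gpu_index);        // check the allocation fits on the device
  void *d_buf = cuda_malloc(size, gpu_index);      // allocate device memory

  cuda_memcpy_async_to_gpu(d_buf, host_in.data(), size, stream);
  // ... call keyswitch / bootstrap entry points on d_buf here ...
  cuda_memcpy_async_to_cpu(host_out.data(), d_buf, size, stream);
  cuda_synchronize_stream(stream);                 // wait for the async copies to finish

  cuda_drop(d_buf, gpu_index);                     // free device memory
  cuda_destroy_stream(stream);
  return 0;
}
```

The cryptographic entry points listed above (keyswitch, amortized and low-latency bootstrap, multi-bit PBS) follow the same pattern: device buffers are prepared with these helpers and then passed to the `cuda_*` functions on the same stream.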
28 backends/tfhe-cuda-backend/build.rs (Normal file)
@@ -0,0 +1,28 @@
|
||||
use std::env;
|
||||
use std::process::Command;
|
||||
|
||||
fn main() {
|
||||
println!("Build tfhe-cuda-backend");
|
||||
if env::consts::OS == "linux" {
|
||||
let output = Command::new("./get_os_name.sh").output().unwrap();
|
||||
let distribution = String::from_utf8(output.stdout).unwrap();
|
||||
if distribution != "Ubuntu\n" {
|
||||
println!(
|
||||
"cargo:warning=This Linux distribution is not officially supported. \
|
||||
Only Ubuntu is supported by tfhe-cuda-backend at this time. Build may fail\n"
|
||||
);
|
||||
}
|
||||
let dest = cmake::build("cuda");
|
||||
println!("cargo:rustc-link-search=native={}", dest.display());
|
||||
println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");
|
||||
println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
|
||||
println!("cargo:rustc-link-lib=gomp");
|
||||
println!("cargo:rustc-link-lib=cudart");
|
||||
println!("cargo:rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
|
||||
println!("cargo:rustc-link-lib=stdc++");
|
||||
} else {
|
||||
panic!(
|
||||
"Error: platform not supported, tfhe-cuda-backend not built (only Linux is supported)"
|
||||
);
|
||||
}
|
||||
}
|
||||
10 backends/tfhe-cuda-backend/cuda/.cmake-format-config.py (Normal file)
@@ -0,0 +1,10 @@
|
||||
# -----------------------------
|
||||
# Options affecting formatting.
|
||||
# -----------------------------
|
||||
with section("format"):
|
||||
|
||||
# How wide to allow formatted cmake files
|
||||
line_width = 120
|
||||
|
||||
# How many spaces to tab for indent
|
||||
tab_size = 2
|
||||
90 backends/tfhe-cuda-backend/cuda/CMakeLists.txt (Normal file)
@@ -0,0 +1,90 @@
|
||||
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
|
||||
project(tfhe_cuda_backend LANGUAGES CXX)
|
||||
|
||||
# See if the minimum CUDA version is available. If not, only enable documentation building.
|
||||
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
|
||||
include(CheckLanguage)
|
||||
# See if CUDA is available
|
||||
check_language(CUDA)
|
||||
# If so, enable CUDA to check the version.
|
||||
if(CMAKE_CUDA_COMPILER)
|
||||
enable_language(CUDA)
|
||||
endif()
|
||||
# If CUDA is not available, or the minimum version is too low do not build
|
||||
if(NOT CMAKE_CUDA_COMPILER)
|
||||
message(FATAL_ERROR "Cuda compiler not found.")
|
||||
endif()
|
||||
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
|
||||
message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
|
||||
endif()
|
||||
# Get CUDA compute capability
|
||||
set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
|
||||
set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
|
||||
execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
|
||||
execute_process(
|
||||
COMMAND ${OUTPUTFILE}
|
||||
RESULT_VARIABLE CUDA_RETURN_CODE
|
||||
OUTPUT_VARIABLE ARCH)
|
||||
file(REMOVE ${OUTPUTFILE})
|
||||
|
||||
if(${CUDA_RETURN_CODE} EQUAL 0)
|
||||
set(CUDA_SUCCESS "TRUE")
|
||||
else()
|
||||
set(CUDA_SUCCESS "FALSE")
|
||||
endif()
|
||||
|
||||
if(${CUDA_SUCCESS})
|
||||
message(STATUS "CUDA Architecture: ${ARCH}")
|
||||
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
|
||||
message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
|
||||
message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
|
||||
message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")
|
||||
else()
|
||||
message(WARNING ${ARCH})
|
||||
endif()
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
set(CMAKE_BUILD_TYPE Release)
|
||||
endif()
|
||||
|
||||
# Add OpenMP support
|
||||
find_package(OpenMP REQUIRED)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
|
||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
|
||||
if(${CUDA_SUCCESS})
|
||||
set(CMAKE_CUDA_ARCHITECTURES native)
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES 70)
|
||||
endif()
|
||||
|
||||
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
|
||||
set(CMAKE_CUDA_FLAGS
|
||||
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
|
||||
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
|
||||
--use_fast_math -Xcompiler -fPIC")
|
||||
|
||||
set(INCLUDE_DIR include)
|
||||
|
||||
add_subdirectory(src)
|
||||
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
|
||||
|
||||
# This is required for rust cargo build
|
||||
install(TARGETS tfhe_cuda_backend DESTINATION .)
|
||||
install(TARGETS tfhe_cuda_backend DESTINATION lib)
|
||||
|
||||
# Define a function to add a lint target.
|
||||
find_file(CPPLINT NAMES cpplint cpplint.exe)
|
||||
if(CPPLINT)
|
||||
# Add a custom target to lint all child projects. Dependencies are specified in child projects.
|
||||
add_custom_target(all_lint)
|
||||
# Don't trigger this target on ALL_BUILD or Visual Studio 'Rebuild Solution'
|
||||
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
|
||||
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
|
||||
endif()
|
||||
|
||||
enable_testing()
|
||||
3 backends/tfhe-cuda-backend/cuda/CPPLINT.cfg (Normal file)
@@ -0,0 +1,3 @@
|
||||
set noparent
|
||||
linelength=240
|
||||
filter=-legal/copyright,-readability/todo,-runtime/references,-build/c++17
|
||||
22 backends/tfhe-cuda-backend/cuda/check_cuda.cu (Normal file)
@@ -0,0 +1,22 @@
|
||||
#include <stdio.h>
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
cudaDeviceProp dP;
|
||||
float min_cc = 3.0;
|
||||
|
||||
int rc = cudaGetDeviceProperties(&dP, 0);
|
||||
if (rc != cudaSuccess) {
|
||||
cudaError_t error = cudaGetLastError();
|
||||
printf("CUDA error: %s", cudaGetErrorString(error));
|
||||
return rc; /* Failure */
|
||||
}
|
||||
if ((dP.major + (dP.minor / 10)) < min_cc) {
|
||||
printf("Min Compute Capability of %2.1f required: %d.%d found\n Not "
|
||||
"Building CUDA Code",
|
||||
min_cc, dP.major, dP.minor);
|
||||
return 1; /* Failure */
|
||||
} else {
|
||||
printf("-arch=sm_%d%d", dP.major, dP.minor);
|
||||
return 0; /* Success */
|
||||
}
|
||||
}
|
||||
6 backends/tfhe-cuda-backend/cuda/format_tfhe_cuda_backend.sh (Executable file)
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
|
||||
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
|
||||
|
||||
find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
|
||||
118 backends/tfhe-cuda-backend/cuda/include/bootstrap.h (Normal file)
@@ -0,0 +1,118 @@
|
||||
#ifndef CUDA_BOOTSTRAP_H
|
||||
#define CUDA_BOOTSTRAP_H
|
||||
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
|
||||
|
||||
extern "C" {
|
||||
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t total_polynomials);
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size);
|
||||
|
||||
void scratch_cuda_bootstrap_amortized_32(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_bootstrap_amortized_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void scratch_cuda_bootstrap_low_latency_32(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_bootstrap_low_latency_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
uint64_t get_buffer_size_bootstrap_amortized_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
|
||||
uint64_t get_buffer_size_bootstrap_low_latency_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t level_count);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
|
||||
|
||||
#endif
|
||||
|
||||
#endif // CUDA_BOOTSTRAP_H
|
||||
46 backends/tfhe-cuda-backend/cuda/include/bootstrap_multibit.h (Normal file)
@@ -0,0 +1,46 @@
|
||||
#ifndef CUDA_MULTI_BIT_H
|
||||
#define CUDA_MULTI_BIT_H
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
|
||||
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
|
||||
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor);
|
||||
|
||||
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t chunk_size = 0);
|
||||
|
||||
void scratch_cuda_multi_bit_pbs_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t chunk_size = 0);
|
||||
|
||||
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
|
||||
}
|
||||
#ifdef __CUDACC__
|
||||
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
|
||||
uint32_t level_count,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t num_samples);
|
||||
|
||||
__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
|
||||
uint32_t level_count,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t ct_count);
|
||||
|
||||
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
|
||||
#endif
|
||||
|
||||
#endif // CUDA_MULTI_BIT_H
|
||||
18 backends/tfhe-cuda-backend/cuda/include/ciphertext.h (Normal file)
@@ -0,0 +1,18 @@
|
||||
#ifndef CUDA_CIPHERTEXT_H
|
||||
#define CUDA_CIPHERTEXT_H
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
|
||||
void *v_stream,
|
||||
uint32_t gpu_index,
|
||||
uint32_t number_of_cts,
|
||||
uint32_t lwe_dimension);
|
||||
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
|
||||
void *v_stream,
|
||||
uint32_t gpu_index,
|
||||
uint32_t number_of_cts,
|
||||
uint32_t lwe_dimension);
|
||||
};
|
||||
#endif
|
||||
88 backends/tfhe-cuda-backend/cuda/include/device.h (Normal file)
@@ -0,0 +1,88 @@
|
||||
#ifndef DEVICE_H
|
||||
#define DEVICE_H
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#define synchronize_threads_in_block() __syncthreads()
|
||||
|
||||
extern "C" {
|
||||
|
||||
struct cuda_stream_t {
|
||||
cudaStream_t stream;
|
||||
uint32_t gpu_index;
|
||||
|
||||
cuda_stream_t(uint32_t gpu_index) {
|
||||
this->gpu_index = gpu_index;
|
||||
|
||||
cudaStreamCreate(&stream);
|
||||
}
|
||||
|
||||
void release() {
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaStreamDestroy(stream);
|
||||
}
|
||||
|
||||
void synchronize() { cudaStreamSynchronize(stream); }
|
||||
};
|
||||
|
||||
cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
|
||||
|
||||
int cuda_destroy_stream(cuda_stream_t *stream);
|
||||
|
||||
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
|
||||
|
||||
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
int cuda_check_support_cooperative_groups();
|
||||
|
||||
int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);
|
||||
|
||||
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);
|
||||
|
||||
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
|
||||
cuda_stream_t *stream);
|
||||
|
||||
int cuda_get_number_of_gpus();
|
||||
|
||||
int cuda_synchronize_device(uint32_t gpu_index);
|
||||
|
||||
int cuda_drop(void *ptr, uint32_t gpu_index);
|
||||
|
||||
int cuda_drop_async(void *ptr, cuda_stream_t *stream);
|
||||
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index);
|
||||
|
||||
int cuda_synchronize_stream(cuda_stream_t *stream);
|
||||
|
||||
#define check_cuda_error(ans) \
|
||||
{ cuda_error((ans), __FILE__, __LINE__); }
|
||||
inline void cuda_error(cudaError_t code, const char *file, int line,
|
||||
bool abort = true) {
|
||||
if (code != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
|
||||
line);
|
||||
if (abort)
|
||||
exit(code);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
|
||||
Torus n);
|
||||
#endif
|
||||
100 backends/tfhe-cuda-backend/cuda/include/helper_debug.cuh (Normal file)
@@ -0,0 +1,100 @@
|
||||
#include "cuComplex.h"
|
||||
#include "thrust/complex.h"
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#define PRINT_VARS
|
||||
#ifdef PRINT_VARS
|
||||
#define PRINT_DEBUG_5(var, begin, end, step, cond) \
|
||||
_print_debug(var, #var, begin, end, step, cond, "", false)
|
||||
#define PRINT_DEBUG_6(var, begin, end, step, cond, text) \
|
||||
_print_debug(var, #var, begin, end, step, cond, text, true)
|
||||
#define CAT(A, B) A##B
|
||||
#define PRINT_SELECT(NAME, NUM) CAT(NAME##_, NUM)
|
||||
#define GET_COUNT(_1, _2, _3, _4, _5, _6, COUNT, ...) COUNT
|
||||
#define VA_SIZE(...) GET_COUNT(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
|
||||
#define PRINT_DEBUG(...) \
|
||||
PRINT_SELECT(PRINT_DEBUG, VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
|
||||
#else
|
||||
#define PRINT_DEBUG(...)
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
__device__ typename std::enable_if<std::is_unsigned<T>::value, void>::type
|
||||
_print_debug(T *var, const char *var_name, int start, int end, int step,
|
||||
bool cond, const char *text, bool has_text) {
|
||||
__syncthreads();
|
||||
if (cond) {
|
||||
if (has_text)
|
||||
printf("%s\n", text);
|
||||
for (int i = start; i < end; i += step) {
|
||||
printf("%s[%u]: %u\n", var_name, i, var[i]);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ typename std::enable_if<std::is_signed<T>::value, void>::type
|
||||
_print_debug(T *var, const char *var_name, int start, int end, int step,
|
||||
bool cond, const char *text, bool has_text) {
|
||||
__syncthreads();
|
||||
if (cond) {
|
||||
if (has_text)
|
||||
printf("%s\n", text);
|
||||
for (int i = start; i < end; i += step) {
|
||||
printf("%s[%u]: %d\n", var_name, i, var[i]);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ typename std::enable_if<std::is_floating_point<T>::value, void>::type
|
||||
_print_debug(T *var, const char *var_name, int start, int end, int step,
|
||||
bool cond, const char *text, bool has_text) {
|
||||
__syncthreads();
|
||||
if (cond) {
|
||||
if (has_text)
|
||||
printf("%s\n", text);
|
||||
for (int i = start; i < end; i += step) {
|
||||
printf("%s[%u]: %.15f\n", var_name, i, var[i]);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__
|
||||
typename std::enable_if<std::is_same<T, thrust::complex<double>>::value,
|
||||
void>::type
|
||||
_print_debug(T *var, const char *var_name, int start, int end, int step,
|
||||
bool cond, const char *text, bool has_text) {
|
||||
__syncthreads();
|
||||
if (cond) {
|
||||
if (has_text)
|
||||
printf("%s\n", text);
|
||||
for (int i = start; i < end; i += step) {
|
||||
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].real(),
|
||||
var[i].imag());
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__
|
||||
typename std::enable_if<std::is_same<T, cuDoubleComplex>::value, void>::type
|
||||
_print_debug(T *var, const char *var_name, int start, int end, int step,
|
||||
bool cond, const char *text, bool has_text) {
|
||||
__syncthreads();
|
||||
if (cond) {
|
||||
if (has_text)
|
||||
printf("%s\n", text);
|
||||
for (int i = start; i < end; i += step) {
|
||||
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].x, var[i].y);
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
1357 backends/tfhe-cuda-backend/cuda/include/integer.h (Normal file; diff suppressed because it is too large)
21 backends/tfhe-cuda-backend/cuda/include/keyswitch.h (Normal file)
@@ -0,0 +1,21 @@
|
||||
#ifndef CNCRT_KS_H_
|
||||
#define CNCRT_KS_H_
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
|
||||
void cuda_keyswitch_lwe_ciphertext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
}
|
||||
|
||||
#endif // CNCRT_KS_H_
|
||||
50 backends/tfhe-cuda-backend/cuda/include/linear_algebra.h (Normal file)
@@ -0,0 +1,50 @@
|
||||
#ifndef CUDA_LINALG_H_
|
||||
#define CUDA_LINALG_H_
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include <cstdint>
|
||||
#include <device.h>
|
||||
|
||||
extern "C" {
|
||||
|
||||
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in_1,
|
||||
void *lwe_array_in_2,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in_1,
|
||||
void *lwe_array_in_2,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
#endif // CUDA_LINALG_H_
|
||||
18 backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt (Normal file)
@@ -0,0 +1,18 @@
|
||||
set(SOURCES
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
|
||||
file(GLOB_RECURSE SOURCES "*.cu")
|
||||
add_library(tfhe_cuda_backend STATIC ${SOURCES})
|
||||
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
|
||||
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
|
||||
target_include_directories(tfhe_cuda_backend PRIVATE .)
|
||||
1
backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
Normal file
1
backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
Normal file
@@ -0,0 +1 @@
|
||||
#include "ciphertext.cuh"
|
||||
44
backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
Normal file
44
backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
Normal file
@@ -0,0 +1,44 @@
|
||||
#ifndef CUDA_CIPHERTEXT_CUH
|
||||
#define CUDA_CIPHERTEXT_CUH
|
||||
|
||||
#include "ciphertext.h"
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
template <typename T>
|
||||
void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t number_of_cts,
|
||||
uint32_t lwe_dimension) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
|
||||
cuda_memcpy_async_to_gpu(dest, src, size, stream);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t number_of_cts,
|
||||
uint32_t lwe_dimension) {
|
||||
cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
|
||||
(uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t number_of_cts,
|
||||
uint32_t lwe_dimension) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
|
||||
cuda_memcpy_async_to_cpu(dest, src, size, stream);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t number_of_cts,
|
||||
uint32_t lwe_dimension) {
|
||||
cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
|
||||
(uint64_t *)dest, (uint64_t *)src, stream, number_of_cts, lwe_dimension);
|
||||
}
|
||||
|
||||
#endif
|
||||
162
backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
Normal file
162
backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
Normal file
@@ -0,0 +1,162 @@
|
||||
#ifndef CNCRT_CRYPTO_CUH
|
||||
#define CNCRT_CRPYTO_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
/**
|
||||
* GadgetMatrix implements the iterator design pattern to decompose a set of
|
||||
* num_poly consecutive polynomials with degree params::degree. A total of
|
||||
* level_count levels is expected and each call to decompose_and_compress_next()
|
||||
* writes to the result the next level. It is also possible to advance an
|
||||
* arbitrary amount of levels by using decompose_and_compress_level().
|
||||
*
|
||||
* This class always decomposes the entire set of num_poly polynomials.
|
||||
* By default, it works on a single polynomial.
|
||||
*/
|
||||
#pragma once
|
||||
template <typename T, class params> class GadgetMatrix {
|
||||
private:
|
||||
uint32_t level_count;
|
||||
uint32_t base_log;
|
||||
uint32_t mask;
|
||||
uint32_t halfbg;
|
||||
uint32_t num_poly;
|
||||
T offset;
|
||||
int current_level;
|
||||
T mask_mod_b;
|
||||
T *state;
|
||||
|
||||
public:
|
||||
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
|
||||
uint32_t num_poly = 1)
|
||||
: base_log(base_log), level_count(level_count), num_poly(num_poly),
|
||||
state(state) {
|
||||
|
||||
mask_mod_b = (1ll << base_log) - 1ll;
|
||||
current_level = level_count;
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < num_poly * params::opt; i++) {
|
||||
state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
// Decomposes all polynomials at once
|
||||
__device__ void decompose_and_compress_next(double2 *result) {
|
||||
for (int j = 0; j < num_poly; j++) {
|
||||
auto result_slice = result + j * params::degree / 2;
|
||||
decompose_and_compress_next_polynomial(result_slice, j);
|
||||
}
|
||||
}
|
||||
|
||||
// Decomposes a single polynomial
|
||||
__device__ void decompose_and_compress_next_polynomial(double2 *result,
|
||||
int j) {
|
||||
if (j == 0)
|
||||
current_level -= 1;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
auto state_slice = state + j * params::degree;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
T res_re = state_slice[tid] & mask_mod_b;
|
||||
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
|
||||
state_slice[tid] >>= base_log;
|
||||
state_slice[tid + params::degree / 2] >>= base_log;
|
||||
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
|
||||
T carry_im =
|
||||
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
|
||||
carry_re >>= (base_log - 1);
|
||||
carry_im >>= (base_log - 1);
|
||||
state_slice[tid] += carry_re;
|
||||
state_slice[tid + params::degree / 2] += carry_im;
|
||||
res_re -= carry_re << base_log;
|
||||
res_im -= carry_im << base_log;
|
||||
|
||||
result[tid].x = (int32_t)res_re;
|
||||
result[tid].y = (int32_t)res_im;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
// Decomposes a single polynomial
|
||||
__device__ void
|
||||
decompose_and_compress_next_polynomial_elements(double2 *result, int j) {
|
||||
if (j == 0)
|
||||
current_level -= 1;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
auto state_slice = state + j * params::degree;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
T res_re = state_slice[tid] & mask_mod_b;
|
||||
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
|
||||
state_slice[tid] >>= base_log;
|
||||
state_slice[tid + params::degree / 2] >>= base_log;
|
||||
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
|
||||
T carry_im =
|
||||
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
|
||||
carry_re >>= (base_log - 1);
|
||||
carry_im >>= (base_log - 1);
|
||||
state_slice[tid] += carry_re;
|
||||
state_slice[tid + params::degree / 2] += carry_im;
|
||||
res_re -= carry_re << base_log;
|
||||
res_im -= carry_im << base_log;
|
||||
|
||||
result[i].x = (int32_t)res_re;
|
||||
result[i].y = (int32_t)res_im;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
__device__ void decompose_and_compress_level(double2 *result, int level) {
|
||||
for (int i = 0; i < level_count - level; i++)
|
||||
decompose_and_compress_next(result);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> class GadgetMatrixSingle {
|
||||
private:
|
||||
uint32_t level_count;
|
||||
uint32_t base_log;
|
||||
uint32_t mask;
|
||||
uint32_t halfbg;
|
||||
T offset;
|
||||
|
||||
public:
|
||||
__device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
|
||||
: base_log(base_log), level_count(level_count) {
|
||||
uint32_t bg = 1 << base_log;
|
||||
this->halfbg = bg / 2;
|
||||
this->mask = bg - 1;
|
||||
T temp = 0;
|
||||
for (int i = 0; i < this->level_count; i++) {
|
||||
temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
|
||||
}
|
||||
this->offset = temp * this->halfbg;
|
||||
}
|
||||
|
||||
__device__ T decompose_one_level_single(T element, uint32_t level) {
|
||||
T s = element + this->offset;
|
||||
uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
|
||||
T temp1 = (s >> decal) & this->mask;
|
||||
return (T)(temp1 - this->halfbg);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus>
|
||||
__device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
|
||||
Torus res = state & mask_mod_b;
|
||||
state >>= base_log;
|
||||
Torus carry = ((res - 1ll) | state) & res;
|
||||
carry >>= base_log - 1;
|
||||
state += carry;
|
||||
res -= carry << base_log;
|
||||
return res;
|
||||
}
|
||||
|
||||
#endif // CNCRT_CRPYTO_H
|
||||
74
backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
Normal file
74
backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
Normal file
@@ -0,0 +1,74 @@
|
||||
#ifndef CNCRT_GGSW_CUH
|
||||
#define CNCRT_GGSW_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
|
||||
template <typename T, typename ST, class params, sharedMemDegree SMD>
|
||||
__global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
|
||||
int8_t *device_mem) {
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
double2 *selected_memory;
|
||||
|
||||
if constexpr (SMD == FULLSM)
|
||||
selected_memory = (double2 *)sharedmem;
|
||||
else
|
||||
selected_memory = (double2 *)device_mem[blockIdx.x * params::degree];
|
||||
|
||||
// Compression
|
||||
int offset = blockIdx.x * blockDim.x;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
ST x = src[(tid) + params::opt * offset];
|
||||
ST y = src[(tid + params::degree / 2) + params::opt * offset];
|
||||
selected_memory[tid].x = x / (double)std::numeric_limits<T>::max();
|
||||
selected_memory[tid].y = y / (double)std::numeric_limits<T>::max();
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(selected_memory);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Write the output to global memory
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < params::opt / 2; j++) {
|
||||
dest[tid + (params::opt >> 1) * offset] = selected_memory[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies the FFT transform on sequence of GGSW ciphertexts already in the
|
||||
* global memory
|
||||
*/
|
||||
template <typename T, typename ST, class params>
|
||||
void batch_fft_ggsw_vector(cuda_stream_t *stream, double2 *dest, T *src,
|
||||
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t gpu_index, uint32_t max_shared_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
int shared_memory_size = sizeof(double) * polynomial_size;
|
||||
|
||||
int gridSize = r * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
|
||||
int blockSize = polynomial_size / params::opt;
|
||||
|
||||
if (max_shared_memory < shared_memory_size) {
|
||||
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
|
||||
<<<gridSize, blockSize, 0, stream->stream>>>(dest, src, d_mem);
|
||||
} else {
|
||||
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, stream->stream>>>(dest, src,
|
||||
d_mem);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif // CNCRT_GGSW_CUH
|
||||
48
backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
Normal file
48
backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
Normal file
@@ -0,0 +1,48 @@
|
||||
#include "keyswitch.cuh"
|
||||
#include "keyswitch.h"
|
||||
#include <cstdint>
|
||||
|
||||
/* Perform keyswitch on a batch of 32 bits input LWE ciphertexts.
|
||||
* Head out to the equivalent operation on 64 bits for more details.
|
||||
*/
|
||||
void cuda_keyswitch_lwe_ciphertext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
stream, static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_output_indexes),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
|
||||
*
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
|
||||
* (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
|
||||
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing
|
||||
* lwe_dimension_in mask values + 1 body value
|
||||
* - ksk: the keyswitch key to be used in the operation
|
||||
* - base log: the log of the base used in the decomposition (should be the one
|
||||
* used to create the ksk)
|
||||
*
|
||||
* This function calls a wrapper to a device kernel that performs the keyswitch
|
||||
* - num_samples blocks of threads are launched
|
||||
*/
|
||||
void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *ksk,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
|
||||
}
|
||||
144
backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
Normal file
144
backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
Normal file
@@ -0,0 +1,144 @@
|
||||
#ifndef CNCRT_KS_CUH
|
||||
#define CNCRT_KS_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include "gadget.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "torus.cuh"
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus>
|
||||
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
|
||||
uint32_t lwe_dimension_out,
|
||||
uint32_t level_count) {
|
||||
int pos = i * level_count * (lwe_dimension_out + 1) +
|
||||
level * (lwe_dimension_out + 1);
|
||||
Torus *ptr = &ksk[pos];
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/*
|
||||
* keyswitch kernel
|
||||
* Each thread handles a piece of the following equation:
|
||||
* $$GLWE_s2(\Delta.m+e) = (0,0,..,0,b) - \sum_{i=0,k-1} <Dec(a_i),
|
||||
* (GLWE_s2(s1_i q/beta),..,GLWE(s1_i q/beta^l)>$$ where k is the dimension of
|
||||
* the GLWE ciphertext. If the polynomial dimension in GLWE is > 1, this
|
||||
* equation is solved for each polynomial coefficient. where Dec denotes the
|
||||
* decomposition with base beta and l levels and the inner product is done
|
||||
* between the decomposition of a_i and l GLWE encryptions of s1_i q/\beta^j,
|
||||
* with j in [1,l] We obtain a GLWE encryption of Delta.m (with Delta the
|
||||
* scaling factor) under key s2 instead of s1, with an increased noise
|
||||
*
|
||||
*/
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
keyswitch(Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
|
||||
int lwe_lower, int lwe_upper, int cutoff) {
|
||||
int tid = threadIdx.x;
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
|
||||
Torus *local_lwe_array_out = (Torus *)sharedmem;
|
||||
|
||||
auto block_lwe_array_in = get_chunk(
|
||||
lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
|
||||
auto block_lwe_array_out = get_chunk(
|
||||
lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);
|
||||
|
||||
auto gadget = GadgetMatrixSingle<Torus>(base_log, level_count);
|
||||
|
||||
int lwe_part_per_thd;
|
||||
if (tid < cutoff) {
|
||||
lwe_part_per_thd = lwe_upper;
|
||||
} else {
|
||||
lwe_part_per_thd = lwe_lower;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int k = 0; k < lwe_part_per_thd; k++) {
|
||||
int idx = tid + k * blockDim.x;
|
||||
local_lwe_array_out[idx] = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (tid == 0) {
|
||||
local_lwe_array_out[lwe_dimension_out] =
|
||||
block_lwe_array_in[lwe_dimension_in];
|
||||
}
|
||||
|
||||
for (int i = 0; i < lwe_dimension_in; i++) {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
Torus a_i =
|
||||
round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);
|
||||
|
||||
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
|
||||
Torus mask_mod_b = (1ll << base_log) - 1ll;
|
||||
|
||||
for (int j = 0; j < level_count; j++) {
|
||||
auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
|
||||
Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
|
||||
for (int k = 0; k < lwe_part_per_thd; k++) {
|
||||
int idx = tid + k * blockDim.x;
|
||||
local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = 0; k < lwe_part_per_thd; k++) {
|
||||
int idx = tid + k * blockDim.x;
|
||||
block_lwe_array_out[idx] = local_lwe_array_out[idx];
|
||||
}
|
||||
}
|
||||
|
||||
/// assume lwe_array_in in the gpu
|
||||
template <typename Torus>
|
||||
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
constexpr int ideal_threads = 128;
|
||||
|
||||
int lwe_dim = lwe_dimension_out + 1;
|
||||
int lwe_lower, lwe_upper, cutoff;
|
||||
if (lwe_dim % ideal_threads == 0) {
|
||||
lwe_lower = lwe_dim / ideal_threads;
|
||||
lwe_upper = lwe_dim / ideal_threads;
|
||||
cutoff = 0;
|
||||
} else {
|
||||
int y =
|
||||
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
|
||||
cutoff = ideal_threads - y;
|
||||
lwe_lower = lwe_dim / ideal_threads;
|
||||
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
|
||||
}
|
||||
|
||||
int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
|
||||
|
||||
int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
|
||||
|
||||
cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
dim3 grid(num_samples, 1, 1);
|
||||
dim3 threads(ideal_threads, 1, 1);
|
||||
|
||||
// cudaFuncSetAttribute(keyswitch<Torus>,
|
||||
// cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
// shared_mem);
|
||||
|
||||
keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
|
||||
lwe_upper, cutoff);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif
|
||||
74
backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
Normal file
74
backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
Normal file
@@ -0,0 +1,74 @@
|
||||
#ifndef CNCRT_TORUS_CUH
|
||||
#define CNCRT_TORUS_CUH
|
||||
|
||||
#include "types/int128.cuh"
|
||||
#include <limits>
|
||||
|
||||
template <typename T>
|
||||
__device__ inline void typecast_double_to_torus(double x, T &r) {
|
||||
r = T(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline void typecast_double_to_torus<uint32_t>(double x,
|
||||
uint32_t &r) {
|
||||
r = __double2uint_rn(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline void typecast_double_to_torus<uint64_t>(double x,
|
||||
uint64_t &r) {
|
||||
// The ull intrinsic does not behave in the same way on all architectures and
|
||||
// on some platforms this causes the cmux tree test to fail
|
||||
// Hence the intrinsic is not used here
|
||||
uint128 nnnn = make_uint128_from_float(x);
|
||||
uint64_t lll = nnnn.lo_;
|
||||
r = lll;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
T shift = sizeof(T) * 8 - level_count * base_log;
|
||||
T mask = 1ll << (shift - 1);
|
||||
T b = (x & mask) >> (shift - 1);
|
||||
T res = x >> shift;
|
||||
res += b;
|
||||
res <<= shift;
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
|
||||
uint32_t log_shift) {
|
||||
output =
|
||||
round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
|
||||
(double)log_shift);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T rescale_torus_element(T element,
|
||||
uint32_t log_shift) {
|
||||
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
|
||||
(double)log_shift);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void
|
||||
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
|
||||
uint32_t log_shift) {
|
||||
output =
|
||||
round(__uint2double_rn(element) /
|
||||
(__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
|
||||
__uint2double_rn(log_shift));
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ void
|
||||
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
|
||||
uint32_t log_shift) {
|
||||
output = round(__ull2double_rn(element) /
|
||||
(__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
|
||||
__uint2double_rn(log_shift));
|
||||
}
|
||||
#endif // CNCRT_TORUS_H
|
||||
350
backends/tfhe-cuda-backend/cuda/src/device.cu
Normal file
350
backends/tfhe-cuda-backend/cuda/src/device.cu
Normal file
@@ -0,0 +1,350 @@
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
/// Unsafe function to create a CUDA stream, must check first that GPU exists
|
||||
cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
|
||||
cudaSetDevice(gpu_index);
|
||||
cuda_stream_t *stream = new cuda_stream_t(gpu_index);
|
||||
return stream;
|
||||
}
|
||||
|
||||
/// Unsafe function to destroy CUDA stream, must check first the GPU exists
|
||||
int cuda_destroy_stream(cuda_stream_t *stream) {
|
||||
stream->release();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Unsafe function that will try to allocate even if gpu_index is invalid
|
||||
/// or if there's not enough memory. A safe wrapper around it must call
|
||||
/// cuda_check_valid_malloc() first
|
||||
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
cudaSetDevice(gpu_index);
|
||||
void *ptr;
|
||||
cudaMalloc((void **)&ptr, size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// Allocates a size-byte array at the device memory. Tries to do it
|
||||
/// asynchronously.
|
||||
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
void *ptr;
|
||||
|
||||
#ifndef CUDART_VERSION
|
||||
#error CUDART_VERSION Undefined!
|
||||
#elif (CUDART_VERSION >= 11020)
|
||||
int support_async_alloc;
|
||||
check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
|
||||
cudaDevAttrMemoryPoolsSupported,
|
||||
stream->gpu_index));
|
||||
|
||||
if (support_async_alloc) {
|
||||
check_cuda_error(cudaMallocAsync((void **)&ptr, size, stream->stream));
|
||||
} else {
|
||||
check_cuda_error(cudaMalloc((void **)&ptr, size));
|
||||
}
|
||||
#else
|
||||
check_cuda_error(cudaMalloc((void **)&ptr, size));
|
||||
#endif
|
||||
return ptr;
|
||||
}
|
||||
|
||||
/// Checks that allocation is valid
|
||||
/// 0: valid
|
||||
/// -1: invalid, not enough memory in device
|
||||
/// -2: invalid, gpu index doesn't exist
|
||||
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
|
||||
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
size_t total_mem, free_mem;
|
||||
cudaMemGetInfo(&free_mem, &total_mem);
|
||||
if (size > free_mem) {
|
||||
// error code: not enough memory
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Returns
|
||||
/// -> 0 if Cooperative Groups is not supported.
|
||||
/// -> 1 otherwise
|
||||
int cuda_check_support_cooperative_groups() {
|
||||
int cooperative_groups_supported = 0;
|
||||
cudaDeviceGetAttribute(&cooperative_groups_supported,
|
||||
cudaDevAttrCooperativeLaunch, 0);
|
||||
|
||||
return cooperative_groups_supported > 0;
|
||||
}
|
||||
|
||||
/// Tries to copy memory to the GPU asynchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, dest);
|
||||
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
check_cuda_error(
|
||||
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Tries to copy memory to the GPU synchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, dest);
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
|
||||
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Tries to copy memory to the CPU synchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, src);
|
||||
if (attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
|
||||
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Tries to copy memory within a GPU asynchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr_dest;
|
||||
cudaPointerGetAttributes(&attr_dest, dest);
|
||||
if (attr_dest.device != stream->gpu_index &&
|
||||
attr_dest.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
cudaPointerAttributes attr_src;
|
||||
cudaPointerGetAttributes(&attr_src, src);
|
||||
if (attr_src.device != stream->gpu_index &&
|
||||
attr_src.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
if (attr_src.device != attr_dest.device) {
|
||||
// error code: different devices
|
||||
return -1;
|
||||
}
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
|
||||
stream->stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Synchronizes device
|
||||
/// 0: success
|
||||
/// -2: error, gpu index doesn't exist
|
||||
int cuda_synchronize_device(uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, dest);
|
||||
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (index < n)
|
||||
array[index] = value;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
|
||||
Torus n) {
|
||||
int block_size = 256;
|
||||
int num_blocks = (n + block_size - 1) / block_size;
|
||||
|
||||
// Launch the kernel
|
||||
cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
|
||||
n);
|
||||
}
|
||||
|
||||
/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
|
||||
template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
|
||||
uint64_t value, uint64_t n);
|
||||
template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
|
||||
uint32_t value, uint32_t n);
|
||||
|
||||
/// Tries to copy memory to the GPU asynchronously
|
||||
/// 0: success
|
||||
/// -1: error, invalid device pointer
|
||||
/// -2: error, gpu index doesn't exist
|
||||
/// -3: error, zero copy size
|
||||
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
|
||||
cuda_stream_t *stream) {
|
||||
if (size == 0) {
|
||||
// error code: zero copy size
|
||||
return -3;
|
||||
}
|
||||
|
||||
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaPointerAttributes attr;
|
||||
cudaPointerGetAttributes(&attr, src);
|
||||
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
|
||||
// error code: invalid device pointer
|
||||
return -1;
|
||||
}
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
check_cuda_error(
|
||||
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Return number of GPUs available
|
||||
int cuda_get_number_of_gpus() {
|
||||
int num_gpus;
|
||||
cudaGetDeviceCount(&num_gpus);
|
||||
return num_gpus;
|
||||
}
|
||||
|
||||
/// Drop a cuda array
|
||||
int cuda_drop(void *ptr, uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
check_cuda_error(cudaFree(ptr));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Drop a cuda array. Tries to do it asynchronously
|
||||
int cuda_drop_async(void *ptr, cuda_stream_t *stream) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
#ifndef CUDART_VERSION
|
||||
#error CUDART_VERSION Undefined!
|
||||
#elif (CUDART_VERSION >= 11020)
|
||||
int support_async_alloc;
|
||||
check_cuda_error(cudaDeviceGetAttribute(&support_async_alloc,
|
||||
cudaDevAttrMemoryPoolsSupported,
|
||||
stream->gpu_index));
|
||||
|
||||
if (support_async_alloc) {
|
||||
check_cuda_error(cudaFreeAsync(ptr, stream->stream));
|
||||
} else {
|
||||
check_cuda_error(cudaFree(ptr));
|
||||
}
|
||||
#else
|
||||
check_cuda_error(cudaFree(ptr));
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Get the maximum size for the shared memory
|
||||
int cuda_get_max_shared_memory(uint32_t gpu_index) {
|
||||
if (gpu_index >= cuda_get_number_of_gpus()) {
|
||||
// error code: invalid gpu_index
|
||||
return -2;
|
||||
}
|
||||
cudaSetDevice(gpu_index);
|
||||
cudaDeviceProp prop;
|
||||
cudaGetDeviceProperties(&prop, gpu_index);
|
||||
int max_shared_memory = 0;
|
||||
if (prop.major >= 6) {
|
||||
max_shared_memory = prop.sharedMemPerMultiprocessor;
|
||||
} else {
|
||||
max_shared_memory = prop.sharedMemPerBlock;
|
||||
}
|
||||
return max_shared_memory;
|
||||
}
|
||||
|
||||
int cuda_synchronize_stream(cuda_stream_t *stream) {
|
||||
stream->synchronize();
|
||||
return 0;
|
||||
}
|
||||
725
backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
Normal file
725
backends/tfhe-cuda-backend/cuda/src/fft/bnsmfft.cuh
Normal file
@@ -0,0 +1,725 @@
|
||||
#ifndef GPU_BOOTSTRAP_FFT_CUH
|
||||
#define GPU_BOOTSTRAP_FFT_CUH
|
||||
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "twiddles.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
/*
|
||||
* Direct negacyclic FFT:
|
||||
* - before the FFT the N real coefficients are stored into a
|
||||
* N/2 sized complex with the even coefficients in the real part
|
||||
* and the odd coefficients in the imaginary part. This is referred to
|
||||
* as the half-size FFT
|
||||
* - when calling BNSMFFT_direct for the forward negacyclic FFT of PBS,
|
||||
* opt is divided by 2 because the butterfly pattern is always applied
|
||||
* between pairs of coefficients
|
||||
* - instead of twisting each coefficient A_j before the FFT by
|
||||
* multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
|
||||
* the FFT is modified, and for each level k of the FFT the twiddle:
|
||||
* w_j,k = exp(-i pi j/2^k)
|
||||
* is replaced with:
|
||||
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
|
||||
*/
|
||||
template <class params> __device__ void NSMFFT_direct(double2 *A) {
|
||||
|
||||
/* We don't make bit reverse here, since twiddles are already reversed
|
||||
* Each thread is always in charge of "opt/2" pairs of coefficients,
|
||||
* which is why we always loop through N/2 by N/opt strides
|
||||
* The pragma unroll instruction tells the compiler to unroll the
|
||||
* full loop, which should increase performance
|
||||
*/
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t twid_id;
|
||||
size_t i1, i2;
|
||||
double2 u, v, w;
|
||||
// level 1
|
||||
// we don't make actual complex multiplication on level1 since we have only
|
||||
// one twiddle, it's real and image parts are equal, so we can multiply
|
||||
// it with simpler operations
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
i1 = tid;
|
||||
i2 = tid + params::degree / 2;
|
||||
|
||||
u = A[i1];
|
||||
v = A[i2] * (double2){0.707106781186547461715008466854,
|
||||
0.707106781186547461715008466854};
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 2
|
||||
// from this level there are more than one twiddles and none of them has equal
|
||||
// real and imag parts, so complete complex multiplication is needed
|
||||
// for each level params::degree / 2^level represents number of coefficients
|
||||
// inside divided chunk of specific level
|
||||
//
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4);
|
||||
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
|
||||
i2 = i1 + params::degree / 4;
|
||||
|
||||
w = negtwiddles[twid_id + 2];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 3
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8);
|
||||
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
|
||||
i2 = i1 + params::degree / 8;
|
||||
|
||||
w = negtwiddles[twid_id + 4];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 4
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 16);
|
||||
i1 =
|
||||
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
|
||||
i2 = i1 + params::degree / 16;
|
||||
|
||||
w = negtwiddles[twid_id + 8];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 5
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 32);
|
||||
i1 =
|
||||
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
|
||||
i2 = i1 + params::degree / 32;
|
||||
|
||||
w = negtwiddles[twid_id + 16];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 6
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 64);
|
||||
i1 =
|
||||
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
|
||||
i2 = i1 + params::degree / 64;
|
||||
|
||||
w = negtwiddles[twid_id + 32];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 7
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 128);
|
||||
i1 = 2 * (params::degree / 128) * twid_id +
|
||||
(tid & (params::degree / 128 - 1));
|
||||
i2 = i1 + params::degree / 128;
|
||||
|
||||
w = negtwiddles[twid_id + 64];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// from level 8, we need to check size of params degree, because we support
|
||||
// minimum actual polynomial size = 256, when compressed size is halfed and
|
||||
// minimum supported compressed size is 128, so we always need first 7
|
||||
// levels of butterfy operation, since butterfly levels are hardcoded
|
||||
// we need to check if polynomial size is big enough to require specific level
|
||||
// of butterfly.
|
||||
if constexpr (params::degree >= 256) {
|
||||
// level 8
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 256);
|
||||
i1 = 2 * (params::degree / 256) * twid_id +
|
||||
(tid & (params::degree / 256 - 1));
|
||||
i2 = i1 + params::degree / 256;
|
||||
|
||||
w = negtwiddles[twid_id + 128];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 512) {
|
||||
// level 9
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 512);
|
||||
i1 = 2 * (params::degree / 512) * twid_id +
|
||||
(tid & (params::degree / 512 - 1));
|
||||
i2 = i1 + params::degree / 512;
|
||||
|
||||
w = negtwiddles[twid_id + 256];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 1024) {
|
||||
// level 10
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 1024);
|
||||
i1 = 2 * (params::degree / 1024) * twid_id +
|
||||
(tid & (params::degree / 1024 - 1));
|
||||
i2 = i1 + params::degree / 1024;
|
||||
|
||||
w = negtwiddles[twid_id + 512];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 2048) {
|
||||
// level 11
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2048);
|
||||
i1 = 2 * (params::degree / 2048) * twid_id +
|
||||
(tid & (params::degree / 2048 - 1));
|
||||
i2 = i1 + params::degree / 2048;
|
||||
|
||||
w = negtwiddles[twid_id + 1024];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 4096) {
|
||||
// level 12
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4096);
|
||||
i1 = 2 * (params::degree / 4096) * twid_id +
|
||||
(tid & (params::degree / 4096 - 1));
|
||||
i2 = i1 + params::degree / 4096;
|
||||
|
||||
w = negtwiddles[twid_id + 2048];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// compressed size = 8192 is actual polynomial size = 16384.
|
||||
// from this size, twiddles can't fit in constant memory,
|
||||
// so from here, butterfly operation access device memory.
|
||||
if constexpr (params::degree >= 8192) {
|
||||
// level 13
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8192);
|
||||
i1 = 2 * (params::degree / 8192) * twid_id +
|
||||
(tid & (params::degree / 8192 - 1));
|
||||
i2 = i1 + params::degree / 8192;
|
||||
|
||||
w = negtwiddles13[twid_id];
|
||||
u = A[i1];
|
||||
v = A[i2] * w;
|
||||
|
||||
A[i1] += v;
|
||||
A[i2] = u - v;
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* negacyclic inverse fft
|
||||
*/
|
||||
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
|
||||
|
||||
/* We don't make bit reverse here, since twiddles are already reversed
|
||||
* Each thread is always in charge of "opt/2" pairs of coefficients,
|
||||
* which is why we always loop through N/2 by N/opt strides
|
||||
* The pragma unroll instruction tells the compiler to unroll the
|
||||
* full loop, which should increase performance
|
||||
*/
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t twid_id;
|
||||
size_t i1, i2;
|
||||
double2 u, w;
|
||||
|
||||
// divide input by compressed polynomial size
|
||||
tid = threadIdx.x;
|
||||
for (size_t i = 0; i < params::opt; ++i) {
|
||||
A[tid] /= params::degree;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// none of the twiddles have equal real and imag part, so
|
||||
// complete complex multiplication has to be done
|
||||
// here we have more than one twiddle
|
||||
// mapping in backward fft is reversed
|
||||
// butterfly operation is started from last level
|
||||
|
||||
// compressed size = 8192 is actual polynomial size = 16384.
|
||||
// twiddles for this size can't fit in constant memory so
|
||||
// butterfly operation for this level acess device memory to fetch
|
||||
// twiddles
|
||||
if constexpr (params::degree >= 8192) {
|
||||
// level 13
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8192);
|
||||
i1 = 2 * (params::degree / 8192) * twid_id +
|
||||
(tid & (params::degree / 8192 - 1));
|
||||
i2 = i1 + params::degree / 8192;
|
||||
|
||||
w = negtwiddles13[twid_id];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 4096) {
|
||||
// level 12
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4096);
|
||||
i1 = 2 * (params::degree / 4096) * twid_id +
|
||||
(tid & (params::degree / 4096 - 1));
|
||||
i2 = i1 + params::degree / 4096;
|
||||
|
||||
w = negtwiddles[twid_id + 2048];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 2048) {
|
||||
// level 11
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2048);
|
||||
i1 = 2 * (params::degree / 2048) * twid_id +
|
||||
(tid & (params::degree / 2048 - 1));
|
||||
i2 = i1 + params::degree / 2048;
|
||||
|
||||
w = negtwiddles[twid_id + 1024];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 1024) {
|
||||
// level 10
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 1024);
|
||||
i1 = 2 * (params::degree / 1024) * twid_id +
|
||||
(tid & (params::degree / 1024 - 1));
|
||||
i2 = i1 + params::degree / 1024;
|
||||
|
||||
w = negtwiddles[twid_id + 512];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 512) {
|
||||
// level 9
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 512);
|
||||
i1 = 2 * (params::degree / 512) * twid_id +
|
||||
(tid & (params::degree / 512 - 1));
|
||||
i2 = i1 + params::degree / 512;
|
||||
|
||||
w = negtwiddles[twid_id + 256];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if constexpr (params::degree >= 256) {
|
||||
// level 8
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 256);
|
||||
i1 = 2 * (params::degree / 256) * twid_id +
|
||||
(tid & (params::degree / 256 - 1));
|
||||
i2 = i1 + params::degree / 256;
|
||||
|
||||
w = negtwiddles[twid_id + 128];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// below level 8, we don't need to check size of params degree, because we
|
||||
// support minimum actual polynomial size = 256, when compressed size is
|
||||
// halfed and minimum supported compressed size is 128, so we always need
|
||||
// last 7 levels of butterfy operation, since butterfly levels are hardcoded
|
||||
// we don't need to check if polynomial size is big enough to require
|
||||
// specific level of butterfly.
|
||||
// level 7
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 128);
|
||||
i1 = 2 * (params::degree / 128) * twid_id +
|
||||
(tid & (params::degree / 128 - 1));
|
||||
i2 = i1 + params::degree / 128;
|
||||
|
||||
w = negtwiddles[twid_id + 64];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 6
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 64);
|
||||
i1 =
|
||||
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
|
||||
i2 = i1 + params::degree / 64;
|
||||
|
||||
w = negtwiddles[twid_id + 32];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 5
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 32);
|
||||
i1 =
|
||||
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
|
||||
i2 = i1 + params::degree / 32;
|
||||
|
||||
w = negtwiddles[twid_id + 16];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 4
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 16);
|
||||
i1 =
|
||||
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
|
||||
i2 = i1 + params::degree / 16;
|
||||
|
||||
w = negtwiddles[twid_id + 8];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 3
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 8);
|
||||
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
|
||||
i2 = i1 + params::degree / 8;
|
||||
|
||||
w = negtwiddles[twid_id + 4];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 2
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 4);
|
||||
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
|
||||
i2 = i1 + params::degree / 4;
|
||||
|
||||
w = negtwiddles[twid_id + 2];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// level 1
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (size_t i = 0; i < params::opt / 2; ++i) {
|
||||
twid_id = tid / (params::degree / 2);
|
||||
i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
|
||||
i2 = i1 + params::degree / 2;
|
||||
|
||||
w = negtwiddles[twid_id + 1];
|
||||
u = A[i1] - A[i2];
|
||||
|
||||
A[i1] += A[i2];
|
||||
A[i2] = u * conjugate(w);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
/*
|
||||
* global batch fft
|
||||
* does fft in half size
|
||||
* unrolling half size fft result in half size + 1 elements
|
||||
* this function must be called with actual degree
|
||||
* function takes as input already compressed input
|
||||
*/
|
||||
template <class params, sharedMemDegree SMD>
|
||||
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
|
||||
double2 *buffer) {
|
||||
extern __shared__ double2 sharedMemoryFFT[];
|
||||
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
|
||||
: sharedMemoryFFT;
|
||||
int tid = threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
__syncthreads();
|
||||
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* global batch polynomial multiplication
|
||||
* only used for fft tests
|
||||
* d_input1 and d_output must not have the same pointer
|
||||
* d_input1 can be modified inside the function
|
||||
*/
|
||||
template <class params, sharedMemDegree SMD>
|
||||
__global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
|
||||
double2 *d_output, double2 *buffer) {
|
||||
extern __shared__ double2 sharedMemoryFFT[];
|
||||
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
|
||||
: sharedMemoryFFT;
|
||||
|
||||
// Move first polynomial into shared memory(if possible otherwise it will
|
||||
// be moved in device buffer)
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = d_input1[blockIdx.x * (params::degree / 2) + tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
|
||||
// Perform direct negacyclic fourier transform
|
||||
__syncthreads();
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
__syncthreads();
|
||||
|
||||
// Put the result of direct fft inside input1
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
d_input1[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Move first polynomial into shared memory(if possible otherwise it will
|
||||
// be moved in device buffer)
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = d_input2[blockIdx.x * (params::degree / 2) + tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
|
||||
// Perform direct negacyclic fourier transform on the second polynomial
|
||||
__syncthreads();
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
__syncthreads();
|
||||
|
||||
// calculate pointwise multiplication inside fft buffer
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] *= d_input1[blockIdx.x * (params::degree / 2) + tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
|
||||
// Perform backward negacyclic fourier transform
|
||||
__syncthreads();
|
||||
NSMFFT_inverse<HalfDegree<params>>(fft);
|
||||
__syncthreads();
|
||||
|
||||
// copy results in output buffer
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // GPU_BOOTSTRAP_FFT_CUH
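// Editor's note: a hedged host-side reference sketch, not part of the diff.
// It shows the naive negacyclic product that batch_polynomial_mul computes
// via FFT; `N` stands in for params::degree and the inputs are plain doubles.
#include <vector>

std::vector<double> negacyclic_mul_reference(const std::vector<double> &a,
                                             const std::vector<double> &b) {
  const size_t N = a.size();
  std::vector<double> out(N, 0.0);
  for (size_t i = 0; i < N; i++) {
    for (size_t j = 0; j < N; j++) {
      size_t k = i + j;
      if (k < N)
        out[k] += a[i] * b[j]; // regular convolution term
      else
        out[k - N] -= a[i] * b[j]; // X^N == -1 in Z[X]/(X^N + 1)
    }
  }
  return out;
}

int main() {
  // (1 + X) * (1 + X) = 1 + 2X + X^2; with N = 2, X^2 == -1, so result = 2X.
  auto r = negacyclic_mul_reference({1, 1}, {1, 1});
  return (r[0] == 0.0 && r[1] == 2.0) ? 0 : 1;
}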
|
||||
8197 backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu (Normal file)
File diff suppressed because it is too large
13 backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cuh (Normal file)
@@ -0,0 +1,13 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH

/*
 * 'negtwiddles' are stored in constant memory for faster access times.
 * Because of its limited size, only twiddles for polynomial sizes up to 2^12
 * can be stored there; twiddles for 2^13 are stored in device memory as
 * 'negtwiddles13'.
 */

extern __constant__ double2 negtwiddles[4096];
extern __device__ double2 negtwiddles13[4096];
#endif
51 backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu (Normal file)
@@ -0,0 +1,51 @@
|
||||
#include "integer/bitwise_ops.cuh"
|
||||
|
||||
void scratch_cuda_integer_radix_bitop_kb_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, BITOP_TYPE op_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
scratch_cuda_integer_radix_bitop_kb<uint64_t>(
|
||||
stream, (int_bitop_buffer<uint64_t> **)mem_ptr, lwe_ciphertext_count,
|
||||
params, op_type, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_bitop_integer_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
|
||||
void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
|
||||
uint32_t lwe_ciphertext_count) {
|
||||
|
||||
host_integer_radix_bitop_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_1),
|
||||
static_cast<uint64_t *>(lwe_array_2),
|
||||
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
|
||||
lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
void cuda_bitnot_integer_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
int8_t *mem_ptr, void *bsk, void *ksk, uint32_t lwe_ciphertext_count) {
|
||||
|
||||
host_integer_radix_bitnot_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
|
||||
lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_bitop(cuda_stream_t *stream, int8_t **mem_ptr_void) {
|
||||
|
||||
int_bitop_buffer<uint64_t> *mem_ptr =
|
||||
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(stream);
|
||||
}
|
||||
51 backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh (Normal file)
@@ -0,0 +1,51 @@
|
||||
#ifndef CUDA_INTEGER_BITWISE_OPS_CUH
|
||||
#define CUDA_INTEGER_BITWISE_OPS_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.cuh"
|
||||
#include "integer.h"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_integer_radix_bitop_kb(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_array_1, Torus *lwe_array_2,
|
||||
int_bitop_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
auto lut = mem_ptr->lut;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, lwe_array_out, lwe_array_1, lwe_array_2, bsk, ksk,
|
||||
num_radix_blocks, lut);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_integer_radix_bitnot_kb(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_array_in,
|
||||
int_bitop_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
auto lut = mem_ptr->lut;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, lwe_array_out, lwe_array_in, bsk, ksk, num_radix_blocks, lut);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_radix_bitop_kb(
|
||||
cuda_stream_t *stream, int_bitop_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
#endif
|
||||
45 backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu (Normal file)
@@ -0,0 +1,45 @@
|
||||
#include "integer/cmux.cuh"
|
||||
|
||||
void scratch_cuda_integer_radix_cmux_kb_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
std::function<uint64_t(uint64_t)> predicate_lut_f =
|
||||
[](uint64_t x) -> uint64_t { return x == 1; };
|
||||
|
||||
scratch_cuda_integer_radix_cmux_kb(
|
||||
stream, (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
|
||||
lwe_ciphertext_count, params, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_condition,
|
||||
void *lwe_array_true, void *lwe_array_false, int8_t *mem_ptr, void *bsk,
|
||||
void *ksk, uint32_t lwe_ciphertext_count) {
|
||||
|
||||
host_integer_radix_cmux_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_condition),
|
||||
static_cast<uint64_t *>(lwe_array_true),
|
||||
static_cast<uint64_t *>(lwe_array_false),
|
||||
(int_cmux_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
|
||||
|
||||
lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_radix_cmux(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_cmux_buffer<uint64_t> *mem_ptr =
|
||||
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(stream);
|
||||
}
|
||||
100 backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh (Normal file)
@@ -0,0 +1,100 @@
|
||||
#ifndef CUDA_INTEGER_CMUX_CUH
|
||||
#define CUDA_INTEGER_CMUX_CUH
|
||||
|
||||
#include "integer.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_array_input, Torus *lwe_condition,
|
||||
int_zero_out_if_buffer<Torus> *mem_ptr,
|
||||
int_radix_lut<Torus> *predicate, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
auto params = mem_ptr->params;
|
||||
|
||||
int big_lwe_size = params.big_lwe_dimension + 1;
|
||||
|
||||
// Left message is shifted
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = (params.big_lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
|
||||
// We can't use integer_radix_apply_bivariate_lookup_table_kb since the
|
||||
// second operand is fixed
|
||||
auto tmp_lwe_array_input = mem_ptr->tmp;
|
||||
for (int i = 0; i < num_radix_blocks; i++) {
|
||||
auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
|
||||
auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
|
||||
|
||||
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0,
|
||||
stream->stream>>>(
|
||||
lwe_array_out_block, lwe_array_input_block, lwe_condition,
|
||||
predicate->lwe_indexes, params.big_lwe_dimension,
|
||||
params.message_modulus, 1);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, lwe_array_out, tmp_lwe_array_input, bsk, ksk, num_radix_blocks,
|
||||
predicate);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_integer_radix_cmux_kb(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_condition, Torus *lwe_array_true,
|
||||
Torus *lwe_array_false,
|
||||
int_cmux_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
|
||||
// Since our CPU threads will be working on different streams, we must ensure
// the work in the main stream is completed before proceeding
|
||||
stream->synchronize();
|
||||
auto true_stream = mem_ptr->zero_if_true_buffer->local_stream;
|
||||
auto false_stream = mem_ptr->zero_if_false_buffer->local_stream;
|
||||
|
||||
#pragma omp parallel sections
|
||||
{
|
||||
// Both sections may be executed in parallel
|
||||
#pragma omp section
|
||||
{
|
||||
auto mem_true = mem_ptr->zero_if_true_buffer;
|
||||
zero_out_if(true_stream, mem_ptr->tmp_true_ct, lwe_array_true,
|
||||
lwe_condition, mem_true, mem_ptr->inverted_predicate_lut, bsk,
|
||||
ksk, num_radix_blocks);
|
||||
}
|
||||
#pragma omp section
|
||||
{
|
||||
auto mem_false = mem_ptr->zero_if_false_buffer;
|
||||
zero_out_if(false_stream, mem_ptr->tmp_false_ct, lwe_array_false,
|
||||
lwe_condition, mem_false, mem_ptr->predicate_lut, bsk, ksk,
|
||||
num_radix_blocks);
|
||||
}
|
||||
}
|
||||
cuda_synchronize_stream(true_stream);
|
||||
cuda_synchronize_stream(false_stream);
|
||||
|
||||
// If the condition was true, true_ct will have kept its value and false_ct
// will be 0. If the condition was false, true_ct will be 0 and false_ct will
// have kept its value.
|
||||
auto added_cts = mem_ptr->tmp_true_ct;
|
||||
host_addition(stream, added_cts, mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
|
||||
params.big_lwe_dimension, num_radix_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, lwe_array_out, added_cts, bsk, ksk, num_radix_blocks,
|
||||
mem_ptr->message_extract_lut);
|
||||
}
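// Editor's note: a hedged plaintext analogue, not part of the diff. The CMUX
// above zeroes whichever operand is not selected by the encrypted condition
// and then adds the two results; on clear values that is simply:
#include <cassert>
#include <cstdint>

static uint64_t plain_cmux(uint64_t condition, uint64_t if_true,
                           uint64_t if_false) {
  uint64_t kept_true = (condition == 1) ? if_true : 0;   // zero_out_if(!cond)
  uint64_t kept_false = (condition == 1) ? 0 : if_false; // zero_out_if(cond)
  return kept_true + kept_false; // message_extract_lut then cleans the result
}

int main() {
  assert(plain_cmux(1, 7, 3) == 7);
  assert(plain_cmux(0, 7, 3) == 3);
  return 0;
}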
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_radix_cmux_kb(
|
||||
cuda_stream_t *stream, int_cmux_buffer<Torus> **mem_ptr,
|
||||
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
|
||||
int_radix_params params, bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
#endif
|
||||
83 backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu (Normal file)
@@ -0,0 +1,83 @@
|
||||
#include "integer/comparison.cuh"
|
||||
|
||||
void scratch_cuda_integer_radix_comparison_kb_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, COMPARISON_TYPE op_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
switch (op_type) {
|
||||
case EQ:
|
||||
case NE:
|
||||
scratch_cuda_integer_radix_equality_check_kb<uint64_t>(
|
||||
stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
|
||||
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
|
||||
break;
|
||||
case GT:
|
||||
case GE:
|
||||
case LT:
|
||||
case LE:
|
||||
case MAX:
|
||||
case MIN:
|
||||
scratch_cuda_integer_radix_difference_check_kb<uint64_t>(
|
||||
stream, (int_comparison_buffer<uint64_t> **)mem_ptr,
|
||||
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_1,
|
||||
void *lwe_array_2, int8_t *mem_ptr, void *bsk, void *ksk,
|
||||
uint32_t lwe_ciphertext_count) {
|
||||
|
||||
int_comparison_buffer<uint64_t> *buffer =
|
||||
(int_comparison_buffer<uint64_t> *)mem_ptr;
|
||||
switch (buffer->op) {
|
||||
case EQ:
|
||||
case NE:
|
||||
host_integer_radix_equality_check_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_1),
|
||||
static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
|
||||
static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
|
||||
break;
|
||||
case GT:
|
||||
case GE:
|
||||
case LT:
|
||||
case LE:
|
||||
host_integer_radix_difference_check_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_1),
|
||||
static_cast<uint64_t *>(lwe_array_2), buffer,
|
||||
buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
|
||||
lwe_ciphertext_count);
|
||||
break;
|
||||
case MAX:
|
||||
case MIN:
|
||||
host_integer_radix_maxmin_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_1),
|
||||
static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
|
||||
static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
|
||||
break;
|
||||
default:
|
||||
printf("Not implemented\n");
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_comparison(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_comparison_buffer<uint64_t> *mem_ptr =
|
||||
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(stream);
|
||||
}
|
||||
468 backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh (Normal file)
@@ -0,0 +1,468 @@
|
||||
#ifndef CUDA_INTEGER_COMPARISON_OPS_CUH
|
||||
#define CUDA_INTEGER_COMPARISON_OPS_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.cuh"
|
||||
#include "integer.h"
|
||||
#include "integer/cmux.cuh"
|
||||
#include "integer/negation.cuh"
|
||||
#include "integer/scalar_addition.cuh"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
// lwe_dimension + 1 threads
|
||||
// todo: This kernel MUST be refactored to a binary reduction
|
||||
template <typename Torus>
|
||||
__global__ void device_accumulate_all_blocks(Torus *output, Torus *input_block,
|
||||
uint32_t lwe_dimension,
|
||||
uint32_t num_blocks) {
|
||||
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx < lwe_dimension + 1) {
|
||||
auto block = &input_block[idx];
|
||||
|
||||
Torus sum = block[0];
|
||||
for (int i = 1; i < num_blocks; i++) {
|
||||
sum += block[i * (lwe_dimension + 1)];
|
||||
}
|
||||
|
||||
output[idx] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
|
||||
Torus *input, uint32_t lwe_dimension,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = (lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
// Add all blocks and store in sum
|
||||
device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
|
||||
output, input, lwe_dimension, num_radix_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
auto are_all_block_true_buffer =
|
||||
mem_ptr->eq_buffer->are_all_block_true_buffer;
|
||||
|
||||
uint32_t total_modulus = message_modulus * carry_modulus;
|
||||
uint32_t max_value = total_modulus - 1;
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
lwe_array_out, lwe_array_in,
|
||||
num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
|
||||
|
||||
int lut_num_blocks = 0;
|
||||
uint32_t remaining_blocks = num_radix_blocks;
|
||||
while (remaining_blocks > 1) {
|
||||
// Split in max_value chunks
|
||||
uint32_t chunk_length = std::min(max_value, remaining_blocks);
|
||||
int num_chunks = remaining_blocks / chunk_length;
|
||||
|
||||
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
|
||||
// as in the worst case we will be adding `max_value` ones
|
||||
auto input_blocks = lwe_array_out;
|
||||
auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
|
||||
for (int i = 0; i < num_chunks; i++) {
|
||||
accumulate_all_blocks(stream, accumulator, input_blocks,
|
||||
big_lwe_dimension, chunk_length);
|
||||
|
||||
accumulator += (big_lwe_dimension + 1);
|
||||
remaining_blocks -= (chunk_length - 1);
|
||||
input_blocks += (big_lwe_dimension + 1) * chunk_length;
|
||||
}
|
||||
accumulator = are_all_block_true_buffer->tmp_block_accumulated;
|
||||
|
||||
// Selects a LUT
|
||||
int_radix_lut<Torus> *lut;
|
||||
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
|
||||
// is_non_zero_lut_buffer LUT
|
||||
lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
} else if (chunk_length == max_value) {
|
||||
// is_max_value LUT
|
||||
lut = are_all_block_true_buffer->is_max_value_lut;
|
||||
} else {
|
||||
// is_equal_to_num_blocks LUT
|
||||
lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
|
||||
if (chunk_length != lut_num_blocks) {
|
||||
auto is_equal_to_num_blocks_lut_f = [max_value,
|
||||
chunk_length](Torus x) -> Torus {
|
||||
return (x & max_value) == chunk_length;
|
||||
};
|
||||
generate_device_accumulator<Torus>(
|
||||
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
|
||||
carry_modulus, is_equal_to_num_blocks_lut_f);
|
||||
|
||||
// We don't have to generate this lut again
|
||||
lut_num_blocks = chunk_length;
|
||||
}
|
||||
}
|
||||
|
||||
// Applies the LUT
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
|
||||
}
|
||||
}
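// Editor's note: a hedged plaintext analogue, not part of the diff. It mirrors
// the reduction above: 0/1 flags are summed in chunks of at most max_value and
// each chunk sum is tested against the chunk length (the role of the LUTs),
// until a single flag remains.
#include <algorithm>
#include <cstdint>
#include <vector>

static bool all_flags_true(std::vector<uint32_t> flags, uint32_t max_value) {
  while (flags.size() > 1) {
    std::vector<uint32_t> next;
    for (size_t i = 0; i < flags.size(); i += max_value) {
      uint32_t len = (uint32_t)std::min<size_t>(max_value, flags.size() - i);
      uint32_t sum = 0;
      for (uint32_t j = 0; j < len; j++)
        sum += flags[i + j];
      next.push_back(sum == len ? 1 : 0); // "is_equal_to_num_blocks" LUT
    }
    flags = next;
  }
  return flags[0] == 1;
}

int main() { return all_flags_true({1, 1, 1, 1, 1, 1}, 3) ? 0 : 1; }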
|
||||
|
||||
// This takes an input slice of blocks.
//
// Each block can encrypt any value as long as it is < message_modulus.
//
// It compares blocks with 0, for either equality or difference.
//
// This returns a Vec of blocks, where each block encrypts 1 or 0
// depending on whether all blocks matched the comparison type with 0.
//
// E.g. for ZeroComparisonType::Equality, if all input blocks are zero
// then all returned blocks will encrypt 1.
//
// The returned Vec will have fewer blocks than the number of input blocks.
// The returned blocks potentially need to be 'reduced' to one block
// with e.g. are_all_comparisons_block_true.
//
// This function exists because sometimes it is faster to concatenate
// multiple vecs of 'boolean' shortint blocks before reducing them with
// are_all_comparisons_block_true.
template <typename Torus>
|
||||
__host__ void host_compare_with_zero_equality(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
|
||||
int32_t num_radix_blocks) {
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
// The idea is that we sum chunks of blocks until the carries are full,
// then we compare the sum with 0.
//
// If all blocks were 0, the sum will be zero.
// If at least one block was not zero, the sum won't be zero.
|
||||
uint32_t total_modulus = message_modulus * carry_modulus;
|
||||
uint32_t message_max = message_modulus - 1;
|
||||
|
||||
uint32_t num_elements_to_fill_carry = (total_modulus - 1) / message_max;
|
||||
|
||||
size_t big_lwe_size = big_lwe_dimension + 1;
|
||||
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
int num_sum_blocks = 0;
|
||||
// Accumulator
|
||||
auto sum = lwe_array_out;
|
||||
|
||||
if (num_radix_blocks == 1) {
|
||||
// Just copy
|
||||
cuda_memcpy_async_gpu_to_gpu(sum, lwe_array_in, big_lwe_size_bytes, stream);
|
||||
num_sum_blocks = 1;
|
||||
} else {
|
||||
uint32_t remainder_blocks = num_radix_blocks;
|
||||
|
||||
auto sum_i = sum;
|
||||
auto chunk = lwe_array_in;
|
||||
while (remainder_blocks > 1) {
|
||||
uint32_t chunk_size =
|
||||
std::min(remainder_blocks, num_elements_to_fill_carry);
|
||||
|
||||
accumulate_all_blocks(stream, sum_i, chunk, big_lwe_dimension,
|
||||
chunk_size);
|
||||
|
||||
num_sum_blocks++;
|
||||
remainder_blocks -= (chunk_size - 1);
|
||||
|
||||
// Update operands
|
||||
chunk += chunk_size * big_lwe_size;
|
||||
sum_i += big_lwe_size;
|
||||
}
|
||||
}
|
||||
|
||||
auto is_equal_to_zero_lut = mem_ptr->diff_buffer->is_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, sum, sum, bsk, ksk, num_sum_blocks, is_equal_to_zero_lut);
|
||||
are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
|
||||
num_sum_blocks);
|
||||
|
||||
// The result will be in the first block. Everything else is
// garbage.
|
||||
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
|
||||
big_lwe_size_bytes * (num_radix_blocks - 1), stream);
|
||||
}
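// Editor's note: a hedged worked example, not part of the diff. With
// message_modulus = 4 and carry_modulus = 4 a block message is < 4 while the
// ciphertext can hold values up to 15, so (16 - 1) / (4 - 1) = 5 blocks can be
// summed before the carry space overflows; this is the chunk_size bound used
// in host_compare_with_zero_equality above.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t message_modulus = 4, carry_modulus = 4;
  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t message_max = message_modulus - 1;
  uint32_t num_elements_to_fill_carry = (total_modulus - 1) / message_max;
  assert(num_elements_to_fill_carry == 5);
  return 0;
}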
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_equality_check_kb(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
|
||||
Torus *lwe_array_2, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
auto eq_buffer = mem_ptr->eq_buffer;
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
|
||||
// Applies the LUT for the comparison operation
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
integer_radix_apply_bivariate_lookup_table_kb(
|
||||
stream, comparisons, lwe_array_1, lwe_array_2, bsk, ksk, num_radix_blocks,
|
||||
eq_buffer->operator_lut);
|
||||
|
||||
// This takes a Vec of blocks, where each block is either 0 or 1.
//
// It returns a block encrypting 1 if all input blocks are 1,
// otherwise the block encrypts 0.
|
||||
are_all_comparisons_block_true(stream, lwe_array_out, comparisons, mem_ptr,
|
||||
bsk, ksk, num_radix_blocks);
|
||||
|
||||
// Zero all blocks but the first
|
||||
size_t big_lwe_size = big_lwe_dimension + 1;
|
||||
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
|
||||
big_lwe_size_bytes * (num_radix_blocks - 1), stream);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_radix_equality_check_kb(
|
||||
cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_comparison_buffer<Torus>(
|
||||
stream, op, params, num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_array_left, Torus *lwe_array_right,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
// When rhs > lhs, the subtraction overflows and the padding bit is set to 1,
// meaning that the output of the PBS will be the negative (modulo the message
// space).
//
// Example:
// lhs: 1, rhs: 3, message modulus: 4, carry modulus: 4
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
// Since there was an overflow, the padding bit is 1 and not 0.
// When applying the LUT to an input value of 14 we would expect 1,
// but since the padding bit is 1 we get -1 modulo our message space,
// so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000.
|
||||
|
||||
// Subtract
|
||||
// Here we need the true lwe sub, not the one that comes from shortint.
|
||||
host_subtraction(stream, lwe_array_out, lwe_array_left, lwe_array_right,
|
||||
big_lwe_dimension, num_radix_blocks);
|
||||
|
||||
// Apply LUT to compare to 0
|
||||
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
stream, lwe_array_out, lwe_array_out, bsk, ksk, num_radix_blocks,
|
||||
is_non_zero_lut);
|
||||
|
||||
// Add one
|
||||
// Here Lhs can have the following values: (-1) % (message modulus * carry
|
||||
// modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
|
||||
host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
|
||||
big_lwe_dimension, num_radix_blocks,
|
||||
message_modulus, carry_modulus);
|
||||
}
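// Editor's note: a hedged worked example on clear values, not part of the
// diff. It replays the comment above: with message_modulus = 4 and
// carry_modulus = 4, comparing lhs = 1 with rhs = 3 via subtraction, LUT and
// "+1" yields the sign 0 (inferior).
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t modulus = 4 * 4; // message_modulus * carry_modulus
  uint32_t lhs = 1, rhs = 3;
  uint32_t diff = (lhs + modulus - rhs) % modulus; // underflow: 14 = 1|1110
  assert(diff == 14);
  // Because the padding bit is set, the "is_non_zero" LUT outputs -1 modulo
  // the message space instead of 1; adding one then gives 0, i.e. "inferior".
  uint32_t after_lut = modulus - 1; // -1 mod 16 == 15
  uint32_t sign = (after_lut + 1) % modulus;
  assert(sign == 0);
  return 0;
}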
|
||||
|
||||
// Reduces a vec containing shortint blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single shortint block containing the
|
||||
// final sign
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_block_comparisons,
|
||||
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks) {
|
||||
|
||||
auto params = tree_buffer->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
// Tree reduction
|
||||
// Reduces a vec containing shortint blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single shortint block containing the
|
||||
// final sign
|
||||
size_t big_lwe_size = big_lwe_dimension + 1;
|
||||
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
auto x = tree_buffer->tmp_x;
|
||||
auto y = tree_buffer->tmp_y;
|
||||
if (x != lwe_block_comparisons)
|
||||
cuda_memcpy_async_gpu_to_gpu(x, lwe_block_comparisons,
|
||||
big_lwe_size_bytes * num_radix_blocks, stream);
|
||||
|
||||
uint32_t partial_block_count = num_radix_blocks;
|
||||
|
||||
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
|
||||
while (partial_block_count > 2) {
|
||||
pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, x, y, bsk, ksk, partial_block_count >> 1, inner_tree_leaf);
|
||||
|
||||
if ((partial_block_count % 2) != 0) {
|
||||
partial_block_count >>= 1;
|
||||
partial_block_count++;
|
||||
|
||||
auto last_y_block = y + (partial_block_count - 1) * big_lwe_size;
|
||||
auto last_x_block = x + (partial_block_count - 1) * big_lwe_size;
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(last_x_block, last_y_block,
|
||||
big_lwe_size_bytes, stream);
|
||||
} else {
|
||||
partial_block_count >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
auto last_lut = tree_buffer->tree_last_leaf_lut;
|
||||
auto block_selector_f = tree_buffer->block_selector_f;
|
||||
std::function<Torus(Torus)> f;
|
||||
|
||||
if (partial_block_count == 2) {
|
||||
pack_blocks(stream, y, x, big_lwe_dimension, partial_block_count, 4);
|
||||
|
||||
f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
|
||||
int msb = (x >> 2) & 3;
|
||||
int lsb = x & 3;
|
||||
|
||||
int final_sign = block_selector_f(msb, lsb);
|
||||
return sign_handler_f(final_sign);
|
||||
};
|
||||
} else {
|
||||
// partial_block_count == 1
|
||||
y = x;
|
||||
f = sign_handler_f;
|
||||
}
|
||||
generate_device_accumulator<Torus>(stream, last_lut->lut, glwe_dimension,
|
||||
polynomial_size, message_modulus,
|
||||
carry_modulus, f);
|
||||
|
||||
// Last leaf
|
||||
integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out, y, bsk,
|
||||
ksk, 1, last_lut);
|
||||
}
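// Editor's note: a hedged plaintext analogue, not part of the diff. Sign
// blocks are assumed to be little-endian (block 0 is the least significant)
// and encoded as 0 = inferior, 1 = equal, 2 = superior; the selector below is
// one plausible reading of block_selector_f: the more significant sign wins
// unless it is "equal".
#include <cstdint>
#include <vector>

static uint32_t reduce_signs(std::vector<uint32_t> signs) {
  auto select = [](uint32_t msb, uint32_t lsb) {
    return msb == 1 ? lsb : msb; // "equal" defers to the lower block
  };
  while (signs.size() > 1) {
    std::vector<uint32_t> next;
    size_t i = 0;
    for (; i + 1 < signs.size(); i += 2)
      next.push_back(select(signs[i + 1], signs[i]));
    if (i < signs.size()) // odd count: carry the last block through
      next.push_back(signs[i]);
    signs = next;
  }
  return signs[0];
}

int main() {
  // {equal, superior, equal} reduces to "superior".
  return reduce_signs({1, 2, 1}) == 2 ? 0 : 1;
}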
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_difference_check_kb(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_left,
|
||||
Torus *lwe_array_right, int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> reduction_lut_f, void *bsk, Torus *ksk,
|
||||
uint32_t total_num_radix_blocks) {
|
||||
|
||||
auto diff_buffer = mem_ptr->diff_buffer;
|
||||
|
||||
auto params = mem_ptr->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto carry_modulus = params.carry_modulus;
|
||||
|
||||
uint32_t num_radix_blocks = total_num_radix_blocks;
|
||||
auto lhs = lwe_array_left;
|
||||
auto rhs = lwe_array_right;
|
||||
if (carry_modulus == message_modulus) {
|
||||
// Packing is possible
|
||||
// Pack inputs
|
||||
Torus *packed_left = diff_buffer->tmp_packed_left;
|
||||
Torus *packed_right = diff_buffer->tmp_packed_right;
|
||||
pack_blocks(stream, packed_left, lwe_array_left, big_lwe_dimension,
|
||||
num_radix_blocks, message_modulus);
|
||||
pack_blocks(stream, packed_right, lwe_array_right, big_lwe_dimension,
|
||||
num_radix_blocks, message_modulus);
|
||||
// From this point we have half number of blocks
|
||||
num_radix_blocks /= 2;
|
||||
|
||||
// Clean noise
|
||||
auto cleaning_lut = mem_ptr->cleaning_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
stream, packed_left, packed_left, bsk, ksk, num_radix_blocks,
|
||||
cleaning_lut);
|
||||
integer_radix_apply_univariate_lookup_table_kb(
|
||||
stream, packed_right, packed_right, bsk, ksk, num_radix_blocks,
|
||||
cleaning_lut);
|
||||
|
||||
lhs = packed_left;
|
||||
rhs = packed_right;
|
||||
}
|
||||
|
||||
// comparisons will be assigned
|
||||
// - 0 if lhs < rhs
|
||||
// - 1 if lhs == rhs
|
||||
// - 2 if lhs > rhs
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk, ksk,
|
||||
num_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
tree_sign_reduction(stream, lwe_array_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer, reduction_lut_f, bsk,
|
||||
ksk, num_radix_blocks);
|
||||
|
||||
// The result will be in the first block. Everything else is garbage.
|
||||
size_t big_lwe_size = big_lwe_dimension + 1;
|
||||
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
cuda_memset_async(lwe_array_out + big_lwe_size, 0,
|
||||
(total_num_radix_blocks - 1) * big_lwe_size_bytes, stream);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_radix_difference_check_kb(
|
||||
cuda_stream_t *stream, int_comparison_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_comparison_buffer<Torus>(
|
||||
stream, op, params, num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_integer_radix_maxmin_kb(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_array_left, Torus *lwe_array_right,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
|
||||
Torus *ksk, uint32_t total_num_radix_blocks) {
|
||||
|
||||
// Compute the sign
|
||||
host_integer_radix_difference_check_kb(
|
||||
stream, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
|
||||
mem_ptr, mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks);
|
||||
|
||||
// Selector
|
||||
host_integer_radix_cmux_kb(
|
||||
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
|
||||
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
|
||||
}
|
||||
|
||||
#endif
|
||||
127 backends/tfhe-cuda-backend/cuda/src/integer/integer.cu (Normal file)
@@ -0,0 +1,127 @@
|
||||
#include "integer/integer.cuh"
|
||||
#include <linear_algebra.h>
|
||||
|
||||
void cuda_full_propagation_64_inplace(
|
||||
cuda_stream_t *stream, void *input_blocks, int8_t *mem_ptr, void *ksk,
|
||||
void *bsk, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t grouping_factor,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 512:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_full_propagate_inplace<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, static_cast<uint64_t *>(input_blocks),
|
||||
(int_fullprop_buffer<uint64_t> *)mem_ptr, static_cast<uint64_t *>(ksk),
|
||||
bsk, lwe_dimension, glwe_dimension, polynomial_size, ks_base_log,
|
||||
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void scratch_cuda_full_propagation_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
scratch_cuda_full_propagation<uint64_t>(
|
||||
stream, (int_fullprop_buffer<uint64_t> **)mem_ptr, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count, grouping_factor,
|
||||
input_lwe_ciphertext_count, message_modulus, carry_modulus, pbs_type,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cleanup_cuda_full_propagation(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_fullprop_buffer<uint64_t> *mem_ptr =
|
||||
(int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
|
||||
|
||||
cuda_drop_async(mem_ptr->lut_buffer, stream);
|
||||
cuda_drop_async(mem_ptr->lut_indexes, stream);
|
||||
|
||||
cuda_drop_async(mem_ptr->pbs_buffer, stream);
|
||||
|
||||
cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
|
||||
cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
|
||||
}
|
||||
|
||||
void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t big_lwe_dimension,
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_level,
|
||||
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
|
||||
stream, (int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_propagate_single_carry_low_latency_kb_64_inplace(
|
||||
cuda_stream_t *stream, void *lwe_array, int8_t *mem_ptr, void *bsk,
|
||||
void *ksk, uint32_t num_blocks) {
|
||||
host_propagate_single_carry_low_latency<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array),
|
||||
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsk,
|
||||
static_cast<uint64_t *>(ksk), num_blocks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
|
||||
int8_t **mem_ptr_void) {
|
||||
int_sc_prop_memory<uint64_t> *mem_ptr =
|
||||
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release(stream);
|
||||
}
|
||||
677 backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh (Normal file)
@@ -0,0 +1,677 @@
|
||||
#ifndef CUDA_INTEGER_CUH
|
||||
#define CUDA_INTEGER_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.h"
|
||||
#include "integer/scalar_addition.cuh"
|
||||
#include "linear_algebra.h"
|
||||
#include "linearalgebra/addition.cuh"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <functional>
|
||||
|
||||
template <typename Torus>
|
||||
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
|
||||
if (sizeof(Torus) == sizeof(uint32_t)) {
|
||||
// 32 bits
|
||||
switch (pbs_type) {
|
||||
case MULTI_BIT:
|
||||
printf("multibit\n");
|
||||
printf("Error: 32-bit multibit PBS is not supported.\n");
|
||||
break;
|
||||
case LOW_LAT:
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case AMORTIZED:
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// 64 bits
|
||||
switch (pbs_type) {
|
||||
case MULTI_BIT:
|
||||
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count,
|
||||
input_lwe_ciphertext_count, num_luts, lwe_idx,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case LOW_LAT:
|
||||
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case AMORTIZED:
|
||||
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
stream, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
|
||||
num_luts, lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Rotates the radix ciphertext right by `value` blocks.
// The grid is one-dimensional; blockIdx.x is the index of the source block.
|
||||
template <typename Torus>
|
||||
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
|
||||
uint32_t value, uint32_t blocks_count,
|
||||
uint32_t lwe_size) {
|
||||
value %= blocks_count;
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
size_t dst_block_id = (src_block_id + value) % blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
}
|
||||
}
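// Editor's note: a hedged plaintext analogue, not part of the diff. Rotating
// four radix blocks right by 1 sends block i to position (i + 1) % 4, exactly
// the index mapping of the kernel above.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t blocks_count = 4, value = 1;
  uint32_t src[blocks_count] = {10, 11, 12, 13};
  uint32_t dst[blocks_count] = {0, 0, 0, 0};
  for (uint32_t src_block_id = 0; src_block_id < blocks_count; src_block_id++)
    dst[(src_block_id + value) % blocks_count] = src[src_block_id];
  assert(dst[0] == 13 && dst[1] == 10 && dst[2] == 11 && dst[3] == 12);
  return 0;
}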
|
||||
|
||||
// Rotates the radix ciphertext left by `value` blocks.
// The grid is one-dimensional; blockIdx.x is the index of the source block.
|
||||
template <typename Torus>
|
||||
__global__ void radix_blocks_rotate_left(Torus *dst, Torus *src, uint32_t value,
|
||||
uint32_t blocks_count,
|
||||
uint32_t lwe_size) {
|
||||
value %= blocks_count;
|
||||
size_t src_block_id = blockIdx.x;
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
size_t dst_block_id = (src_block_id >= value)
|
||||
? src_block_id - value
|
||||
: src_block_id - value + blocks_count;
|
||||
size_t stride = blockDim.x;
|
||||
|
||||
auto cur_src_block = &src[src_block_id * lwe_size];
|
||||
auto cur_dst_block = &dst[dst_block_id * lwe_size];
|
||||
|
||||
for (size_t i = tid; i < lwe_size; i += stride) {
|
||||
cur_dst_block[i] = cur_src_block[i];
|
||||
}
|
||||
}
|
||||
|
||||
// polynomial_size threads
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
device_pack_bivariate_blocks(Torus *lwe_array_out, Torus *lwe_array_1,
|
||||
Torus *lwe_array_2, Torus *lwe_indexes,
|
||||
uint32_t lwe_dimension, uint32_t message_modulus,
|
||||
uint32_t num_blocks) {
|
||||
int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (tid < num_blocks * (lwe_dimension + 1)) {
|
||||
int block_id = tid / (lwe_dimension + 1);
|
||||
int coeff_id = tid % (lwe_dimension + 1);
|
||||
|
||||
int pos = lwe_indexes[block_id] * (lwe_dimension + 1) + coeff_id;
|
||||
lwe_array_out[pos] = lwe_array_1[pos] * message_modulus + lwe_array_2[pos];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
|
||||
Torus *lwe_array_1, Torus *lwe_array_2,
|
||||
Torus *lwe_indexes, uint32_t lwe_dimension,
|
||||
uint32_t message_modulus,
|
||||
uint32_t num_radix_blocks) {
|
||||
|
||||
// Left message is shifted
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = num_radix_blocks * (lwe_dimension + 1);
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
|
||||
lwe_array_out, lwe_array_1, lwe_array_2, lwe_indexes, lwe_dimension,
|
||||
message_modulus, num_radix_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
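// Editor's note: a hedged worked example, not part of the diff. The packing
// above combines two block messages into one value so a single (bivariate)
// LUT sees both operands; with message_modulus = 4, lhs = 2 and rhs = 3 pack
// to 2 * 4 + 3 = 11, and the LUT recovers them as 11 / 4 and 11 % 4.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t message_modulus = 4;
  uint32_t lhs = 2, rhs = 3;
  uint32_t packed = lhs * message_modulus + rhs;
  assert(packed == 11);
  assert(packed / message_modulus == lhs && packed % message_modulus == rhs);
  return 0;
}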
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
|
||||
Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
|
||||
// apply_lookup_table
|
||||
auto params = lut->params;
|
||||
auto pbs_type = params.pbs_type;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto small_lwe_dimension = params.small_lwe_dimension;
|
||||
auto ks_level = params.ks_level;
|
||||
auto ks_base_log = params.ks_base_log;
|
||||
auto pbs_level = params.pbs_level;
|
||||
auto pbs_base_log = params.pbs_base_log;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto grouping_factor = params.grouping_factor;
|
||||
|
||||
// Compute Keyswitch-PBS
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
stream, lut->tmp_lwe_after_ks, lut->lwe_indexes, lwe_array_in,
|
||||
lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
|
||||
execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
|
||||
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk,
|
||||
lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
|
||||
num_radix_blocks, 1, 0,
|
||||
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
|
||||
Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
|
||||
int_radix_lut<Torus> *lut) {
|
||||
// apply_lookup_table_bivariate
|
||||
|
||||
auto params = lut->params;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto message_modulus = params.message_modulus;
|
||||
|
||||
// Left message is shifted
|
||||
pack_bivariate_blocks(stream, lut->tmp_lwe_before_ks, lwe_array_1,
|
||||
lwe_array_2, lut->lwe_indexes, big_lwe_dimension,
|
||||
message_modulus, num_radix_blocks);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// Apply LUT
|
||||
integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
|
||||
lut->tmp_lwe_before_ks, bsk,
|
||||
ksk, num_radix_blocks, lut);
|
||||
}
|
||||
|
||||
// Rotates the slice in-place such that the first `mid` elements of the slice
// move to the end while the last `array_length - mid` elements move to the
// front. After calling rotate_left, the element previously at index `mid`
// becomes the first element in the slice.
|
||||
template <typename Torus>
|
||||
void rotate_left(Torus *buffer, int mid, uint32_t array_length) {
|
||||
mid = mid % array_length;
|
||||
|
||||
std::rotate(buffer, buffer + mid, buffer + array_length);
|
||||
}
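// Editor's note: a hedged usage example, not part of the diff; it assumes the
// rotate_left helper defined just above (std::rotate requires <algorithm>).
// Rotating {0, 1, 2, 3, 4} left by mid = 2 yields {2, 3, 4, 0, 1}.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t buffer[] = {0, 1, 2, 3, 4};
  rotate_left(buffer, 2, 5);
  assert(buffer[0] == 2 && buffer[4] == 1);
  return 0;
}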
|
||||
|
||||
template <typename Torus>
|
||||
void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t message_modulus,
|
||||
uint32_t carry_modulus,
|
||||
std::function<Torus(Torus)> f) {
|
||||
|
||||
uint32_t modulus_sup = message_modulus * carry_modulus;
|
||||
uint32_t box_size = polynomial_size / modulus_sup;
|
||||
Torus delta = (1ul << 63) / modulus_sup;
|
||||
|
||||
memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
|
||||
|
||||
auto body = &acc[glwe_dimension * polynomial_size];
|
||||
|
||||
// This accumulator extracts the carry bits
|
||||
for (int i = 0; i < modulus_sup; i++) {
|
||||
int index = i * box_size;
|
||||
for (int j = index; j < index + box_size; j++) {
|
||||
auto f_eval = f(i);
|
||||
body[j] = f_eval * delta;
|
||||
}
|
||||
}
|
||||
|
||||
int half_box_size = box_size / 2;
|
||||
|
||||
// Negate the first half_box_size coefficients
|
||||
for (int i = 0; i < half_box_size; i++) {
|
||||
body[i] = -body[i];
|
||||
}
|
||||
|
||||
rotate_left(body, half_box_size, polynomial_size);
|
||||
}
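// Editor's note: a hedged worked example, not part of the diff. With
// polynomial_size = 2048, message_modulus = 4 and carry_modulus = 4, each of
// the 16 possible inputs owns a box of 2048 / 16 = 128 coefficients encoding
// f(i) * delta with delta = 2^63 / 16 = 2^59; the half-box negation and
// rotation above centre each box around its input value.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t polynomial_size = 2048, message_modulus = 4, carry_modulus = 4;
  uint32_t modulus_sup = message_modulus * carry_modulus;
  uint32_t box_size = polynomial_size / modulus_sup;
  uint64_t delta = (1ull << 63) / modulus_sup;
  assert(box_size == 128);
  assert(delta == (1ull << 59));
  return 0;
}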
|
||||
|
||||
template <typename Torus>
|
||||
void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t message_modulus,
|
||||
uint32_t carry_modulus,
|
||||
std::function<Torus(Torus, Torus)> f) {
|
||||
|
||||
Torus factor_u64 = message_modulus;
|
||||
auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus {
|
||||
Torus lhs = (input / factor_u64) % message_modulus;
|
||||
Torus rhs = (input % factor_u64) % message_modulus;
|
||||
|
||||
return f(lhs, rhs);
|
||||
};
|
||||
|
||||
generate_lookup_table<Torus>(acc, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, wrapped_f);
|
||||
}
|
||||
|
||||
/*
|
||||
* generate bivariate accumulator for device pointer
|
||||
* v_stream - cuda stream
|
||||
* acc - device pointer for bivariate accumulator
|
||||
* ...
|
||||
* f - wrapping function with two Torus inputs
|
||||
*/
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_bivariate(
|
||||
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
std::function<Torus(Torus, Torus)> f) {
|
||||
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
|
||||
// fill bivariate accumulator
|
||||
generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f);
|
||||
|
||||
// copy host lut and lut_indexes to device
|
||||
cuda_memcpy_async_to_gpu(
|
||||
acc_bivariate, h_lut,
|
||||
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
|
||||
|
||||
cuda_synchronize_stream(stream);
|
||||
free(h_lut);
|
||||
}
|
||||
|
||||
/*
|
||||
* generate univariate accumulator for device pointer
|
||||
* v_stream - cuda stream
|
||||
* acc - device pointer for accumulator
|
||||
* ...
|
||||
* f - evaluating function with one Torus input
|
||||
*/
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t message_modulus,
|
||||
uint32_t carry_modulus,
|
||||
std::function<Torus(Torus)> f) {
|
||||
|
||||
// host lut
|
||||
Torus *h_lut =
|
||||
(Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
|
||||
|
||||
// fill accumulator
|
||||
generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
|
||||
message_modulus, carry_modulus, f);
|
||||
|
||||
// copy host lut and lut_indexes to device
|
||||
cuda_memcpy_async_to_gpu(
|
||||
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
|
||||
stream);
|
||||
|
||||
cuda_synchronize_stream(stream);
|
||||
free(h_lut);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_propagate_single_carry_low_latency_kb_inplace(
|
||||
cuda_stream_t *stream, int_sc_prop_memory<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_sc_prop_memory<Torus>(stream, params, num_radix_blocks,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
|
||||
Torus *lwe_array,
|
||||
int_sc_prop_memory<Torus> *mem,
|
||||
void *bsk, Torus *ksk,
|
||||
uint32_t num_blocks) {
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto big_lwe_size = glwe_dimension * polynomial_size + 1;
|
||||
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
auto generates_or_propagates = mem->generates_or_propagates;
|
||||
auto step_output = mem->step_output;
|
||||
|
||||
auto luts_array = mem->luts_array;
|
||||
auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum;
|
||||
auto message_acc = mem->message_acc;
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
|
||||
luts_array);
|
||||
|
||||
// Compute the prefix sum with a Hillis & Steele scan
|
||||
|
||||
int num_steps = ceil(log2((double)num_blocks));
|
||||
int space = 1;
|
||||
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
|
||||
big_lwe_size_bytes * num_blocks, stream);
|
||||
|
||||
for (int step = 0; step < num_steps; step++) {
|
||||
auto cur_blocks = &step_output[space * big_lwe_size];
|
||||
auto prev_blocks = generates_or_propagates;
|
||||
int cur_total_blocks = num_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
|
||||
luts_carry_propagation_sum);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
|
||||
cur_blocks,
|
||||
big_lwe_size_bytes * cur_total_blocks, stream);
|
||||
space *= 2;
|
||||
}
|
||||
|
||||
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
|
||||
step_output, generates_or_propagates, 1, num_blocks, big_lwe_size);
|
||||
cuda_memset_async(step_output, 0, big_lwe_size_bytes, stream);
|
||||
|
||||
host_addition(stream, lwe_array, lwe_array, step_output,
|
||||
glwe_dimension * polynomial_size, num_blocks);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, lwe_array, lwe_array, bsk, ksk, num_blocks, message_acc);
|
||||
}
|
||||
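// Illustrative sketch (not part of the backend): the same Hillis & Steele
// inclusive prefix sum as above, but on cleartext generate/propagate flags,
// to show what the encrypted loop computes. The combine() rule below is an
// assumption used only for this example: a block generates a carry (2),
// propagates it (1) or kills it (0).
#include <algorithm>
#include <cmath>
#include <vector>

inline void prefix_sum_carry_flags(std::vector<int> &flags) {
  auto combine = [](int cur, int prev) {
    // If the current block propagates, it takes the state of the previous
    // prefix; otherwise it keeps its own generate/kill state.
    return cur == 1 ? prev : cur;
  };
  int num_blocks = static_cast<int>(flags.size());
  int num_steps = static_cast<int>(std::ceil(std::log2((double)num_blocks)));
  int space = 1;
  std::vector<int> prev = flags;
  for (int step = 0; step < num_steps; step++) {
    for (int i = space; i < num_blocks; i++)
      flags[i] = combine(flags[i], prev[i - space]);
    prev = flags;
    space *= 2;
  }
  // flags[i] now tells whether a carry comes out of block i; the kernel then
  // rotates this by one block to get the carry going *into* each block.
}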

/*
 * input_blocks: input radix ciphertext; propagation happens in place
 * acc_message_carry: list of two luts, [(message_acc), (carry_acc)]
 * lut_indexes_message_carry: lut_indexes for message and carry, should always
 *   be {0, 1}
 * small_lwe_vector: output of the keyswitch, should have
 *   size = 2 * (lwe_dimension + 1) * sizeof(Torus)
 * big_lwe_vector: output of the pbs, should have
 *   size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
 */
template <typename Torus, typename STorus, class params>
void host_full_propagate_inplace(cuda_stream_t *stream, Torus *input_blocks,
                                 int_fullprop_buffer<Torus> *mem_ptr,
                                 Torus *ksk, void *bsk, uint32_t lwe_dimension,
                                 uint32_t glwe_dimension,
                                 uint32_t polynomial_size, uint32_t ks_base_log,
                                 uint32_t ks_level, uint32_t pbs_base_log,
                                 uint32_t pbs_level, uint32_t grouping_factor,
                                 uint32_t num_blocks) {

  int big_lwe_size = (glwe_dimension * polynomial_size + 1);
  int small_lwe_size = (lwe_dimension + 1);

  for (int i = 0; i < num_blocks; i++) {
    auto cur_input_block = &input_blocks[i * big_lwe_size];

    cuda_keyswitch_lwe_ciphertext_vector<Torus>(
        stream, mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes,
        cur_input_block, mem_ptr->lwe_indexes, ksk,
        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
        1);

    cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size],
                                 mem_ptr->tmp_small_lwe_vector,
                                 small_lwe_size * sizeof(Torus), stream);

    execute_pbs<Torus>(
        stream, mem_ptr->tmp_big_lwe_vector, mem_ptr->lwe_indexes,
        mem_ptr->lut_buffer, mem_ptr->lut_indexes,
        mem_ptr->tmp_small_lwe_vector, mem_ptr->lwe_indexes, bsk,
        mem_ptr->pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, grouping_factor, 2, 2, 0,
        cuda_get_max_shared_memory(stream->gpu_index), mem_ptr->pbs_type);

    cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
                                 big_lwe_size * sizeof(Torus), stream);

    if (i < num_blocks - 1) {
      auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
      host_addition(stream, next_input_block, next_input_block,
                    &mem_ptr->tmp_big_lwe_vector[big_lwe_size],
                    glwe_dimension * polynomial_size, 1);
    }
  }
}

template <typename Torus>
void scratch_cuda_full_propagation(
    cuda_stream_t *stream, int_fullprop_buffer<Torus> **mem_ptr,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t pbs_level, uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory) {

  // PBS
  int8_t *pbs_buffer;
  if (pbs_type == MULTI_BIT) {
    uint32_t lwe_chunk_size = get_average_lwe_chunk_size(
        lwe_dimension, pbs_level, glwe_dimension, num_radix_blocks);
    // Only 64 bits is supported
    scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
                                  glwe_dimension, polynomial_size, pbs_level,
                                  grouping_factor, num_radix_blocks,
                                  cuda_get_max_shared_memory(stream->gpu_index),
                                  allocate_gpu_memory, lwe_chunk_size);
  } else {
    // Classic
    // We only use low latency for classic mode
    if (sizeof(Torus) == sizeof(uint32_t))
      scratch_cuda_bootstrap_low_latency_32(
          stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
          num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
          allocate_gpu_memory);
    else
      scratch_cuda_bootstrap_low_latency_64(
          stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
          num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
          allocate_gpu_memory);
  }

  // LUT
  Torus *lut_buffer;
  if (allocate_gpu_memory) {
    // LUT is used as a trivial encryption, so we only allocate memory for the
    // body
    Torus lut_buffer_size =
        2 * (glwe_dimension + 1) * polynomial_size * sizeof(Torus);

    lut_buffer = (Torus *)cuda_malloc_async(lut_buffer_size, stream);

    // LUTs
    auto lut_f_message = [message_modulus](Torus x) -> Torus {
      return x % message_modulus;
    };
    auto lut_f_carry = [message_modulus](Torus x) -> Torus {
      return x / message_modulus;
    };

    //
    Torus *lut_buffer_message = lut_buffer;
    Torus *lut_buffer_carry =
        lut_buffer + (glwe_dimension + 1) * polynomial_size;

    generate_device_accumulator<Torus>(
        stream, lut_buffer_message, glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, lut_f_message);

    generate_device_accumulator<Torus>(stream, lut_buffer_carry, glwe_dimension,
                                       polynomial_size, message_modulus,
                                       carry_modulus, lut_f_carry);
  }

  Torus *lut_indexes;
  if (allocate_gpu_memory) {
    lut_indexes = (Torus *)cuda_malloc_async(2 * sizeof(Torus), stream);

    Torus h_lut_indexes[2] = {0, 1};
    cuda_memcpy_async_to_gpu(lut_indexes, h_lut_indexes, 2 * sizeof(Torus),
                             stream);
  }

  Torus *lwe_indexes;
  if (allocate_gpu_memory) {
    Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);

    lwe_indexes = (Torus *)cuda_malloc_async(lwe_indexes_size, stream);
    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
    for (int i = 0; i < num_radix_blocks; i++)
      h_lwe_indexes[i] = i;
    cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
                             stream);
    cuda_synchronize_stream(stream);
    free(h_lwe_indexes);
  }

  // Temporary arrays
  Torus *small_lwe_vector;
  Torus *big_lwe_vector;
  if (allocate_gpu_memory) {
    Torus small_vector_size = 2 * (lwe_dimension + 1) * sizeof(Torus);
    Torus big_vector_size =
        2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus);

    small_lwe_vector = (Torus *)cuda_malloc_async(small_vector_size, stream);
    big_lwe_vector = (Torus *)cuda_malloc_async(big_vector_size, stream);
  }

  *mem_ptr = new int_fullprop_buffer<Torus>;

  (*mem_ptr)->pbs_type = pbs_type;
  (*mem_ptr)->pbs_buffer = pbs_buffer;

  (*mem_ptr)->lut_buffer = lut_buffer;
  (*mem_ptr)->lut_indexes = lut_indexes;
  (*mem_ptr)->lwe_indexes = lwe_indexes;

  (*mem_ptr)->tmp_small_lwe_vector = small_lwe_vector;
  (*mem_ptr)->tmp_big_lwe_vector = big_lwe_vector;
}

// (lwe_dimension+1) threads
// (num_radix_blocks / 2) thread blocks
template <typename Torus>
__global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
                                   uint32_t lwe_dimension,
                                   uint32_t num_radix_blocks, uint32_t factor) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  if (tid < (lwe_dimension + 1)) {
    for (int bid = 0; bid < (num_radix_blocks / 2); bid++) {
      Torus *lsb_block = lwe_array_in + (2 * bid) * (lwe_dimension + 1);
      Torus *msb_block = lsb_block + (lwe_dimension + 1);

      Torus *packed_block = lwe_array_out + bid * (lwe_dimension + 1);

      packed_block[tid] = lsb_block[tid] + factor * msb_block[tid];
    }

    if (num_radix_blocks % 2 != 0) {
      // We couldn't pack the last block, so we just copy it
      Torus *lsb_block =
          lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
      Torus *last_block =
          lwe_array_out + (num_radix_blocks / 2) * (lwe_dimension + 1);

      last_block[tid] = lsb_block[tid];
    }
  }
}

// Packs the low ciphertext in the message parts of the high ciphertext
// and moves the high ciphertext into the carry part.
//
// This requires the block parameters to have enough room for two ciphertexts,
// so the carry modulus must be at least as large as the message modulus.
//
// Expects the carry buffer to be empty
template <typename Torus>
__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
                          Torus *lwe_array_in, uint32_t lwe_dimension,
                          uint32_t num_radix_blocks, uint32_t factor) {
  assert(lwe_array_out != lwe_array_in);

  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  device_pack_blocks<<<num_blocks, num_threads, 0, stream->stream>>>(
      lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
}
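// Illustrative sketch (not part of the backend): the cleartext effect of
// pack_blocks. Two adjacent blocks (lsb, msb) are fused into one value
// lsb + factor * msb, with factor typically equal to message_modulus, so the
// msb ends up in the carry space of the packed block. Names below are
// hypothetical and only used for this example.
#include <cstdint>
#include <vector>

inline std::vector<uint64_t>
pack_cleartext_blocks(const std::vector<uint64_t> &blocks, uint64_t factor) {
  std::vector<uint64_t> packed;
  size_t i = 0;
  for (; i + 1 < blocks.size(); i += 2)
    packed.push_back(blocks[i] + factor * blocks[i + 1]);
  if (i < blocks.size())
    packed.push_back(blocks[i]); // odd block count: last block is just copied
  return packed;
}

// Example: blocks = {3, 1, 2, 0}, factor = 4 (message_modulus)
//   -> packed = {3 + 4 * 1, 2 + 4 * 0} = {7, 2}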

template <typename Torus>
__global__ void
device_create_trivial_radix(Torus *lwe_array, Torus *scalar_input,
                            int32_t num_blocks, uint32_t lwe_dimension,
                            uint64_t delta) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
    Torus scalar = scalar_input[tid];
    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;

    *body = scalar * delta;
  }
}

template <typename Torus>
__host__ void
create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
                     Torus *scalar_array, uint32_t lwe_dimension,
                     uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
                     uint64_t message_modulus, uint64_t carry_modulus) {

  size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
  cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);

  if (num_scalar_blocks == 0)
    return;

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = num_scalar_blocks;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_create_trivial_radix<<<grid, thds, 0, stream->stream>>>(
      lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
}
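// Illustrative sketch (not part of the backend): how a cleartext value is
// placed in the body of a trivial LWE ciphertext. With message_modulus = 4 and
// carry_modulus = 4, delta = 2^63 / 16 = 2^59, so the message occupies the top
// bits of the 64-bit torus and the rest is left for noise. Names below are
// hypothetical and only used for this example.
#include <cstdint>

inline uint64_t encode_cleartext(uint64_t scalar, uint64_t message_modulus,
                                 uint64_t carry_modulus) {
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
  return scalar * delta; // what device_create_trivial_radix writes in the body
}

inline uint64_t decode_cleartext(uint64_t body, uint64_t message_modulus,
                                 uint64_t carry_modulus) {
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
  // Rounding division undoes the shift once the noise is small enough.
  return (body + delta / 2) / delta;
}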

#endif // TFHE_RS_INTERNAL_INTEGER_CUH

107  backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu  Normal file
@@ -0,0 +1,107 @@
#include "integer/multiplication.cuh"
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the integer radix multiplication in keyswitch->bootstrap order.
|
||||
*/
|
||||
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
|
||||
uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor,
|
||||
uint32_t num_radix_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
polynomial_size, lwe_dimension, ks_level, ks_base_log,
|
||||
pbs_level, pbs_base_log, grouping_factor,
|
||||
message_modulus, carry_modulus);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 2048:
|
||||
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t>(
|
||||
stream, (int_mul_memory<uint64_t> **)mem_ptr, num_radix_blocks, params,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Computes a multiplication between two 64 bit radix lwe ciphertexts
|
||||
* encrypting integer values. keyswitch -> bootstrap pattern is used, function
|
||||
* works for single pair of radix ciphertexts, 'v_stream' can be used for
|
||||
* parallelization
|
||||
* - 'v_stream' is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - 'gpu_index' is the index of the GPU to be used in the kernel launch
|
||||
* - 'radix_lwe_out' is 64 bit radix big lwe ciphertext, product of
|
||||
* multiplication
|
||||
* - 'radix_lwe_left' left radix big lwe ciphertext
|
||||
* - 'radix_lwe_right' right radix big lwe ciphertext
|
||||
* - 'bsk' bootstrapping key in fourier domain
|
||||
* - 'ksk' keyswitching key
|
||||
* - 'mem_ptr'
|
||||
* - 'message_modulus' message_modulus
|
||||
* - 'carry_modulus' carry_modulus
|
||||
* - 'glwe_dimension' glwe_dimension
|
||||
* - 'lwe_dimension' is the dimension of small lwe ciphertext
|
||||
* - 'polynomial_size' polynomial size
|
||||
* - 'pbs_base_log' base log used in the pbs
|
||||
* - 'pbs_level' decomposition level count used in the pbs
|
||||
* - 'ks_level' decomposition level count used in the keyswitch
|
||||
* - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
|
||||
* ciphertext
|
||||
* - 'pbs_type' selects which PBS implementation should be used
|
||||
* - 'max_shared_memory' maximum shared memory per cuda block
|
||||
*/
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
    cuda_stream_t *stream, void *radix_lwe_out, void *radix_lwe_left,
    void *radix_lwe_right, void *bsk, void *ksk, int8_t *mem_ptr,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 2048:
    host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
        stream, static_cast<uint64_t *>(radix_lwe_out),
        static_cast<uint64_t *>(radix_lwe_left),
        static_cast<uint64_t *>(radix_lwe_right), bsk,
        static_cast<uint64_t *>(ksk), (int_mul_memory<uint64_t> *)mem_ptr,
        num_blocks);
    break;
  default:
    break;
  }
}

void cleanup_cuda_integer_mult(cuda_stream_t *stream, int8_t **mem_ptr_void) {

  int_mul_memory<uint64_t> *mem_ptr =
      (int_mul_memory<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(stream);
}

void cuda_small_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    cuda_stream_t *stream, void *lwe_array, uint64_t scalar,
    uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

  cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
      stream, lwe_array, lwe_array, scalar, lwe_dimension,
      lwe_ciphertext_count);
}

void cuda_small_scalar_multiplication_integer_radix_ciphertext_64(
    cuda_stream_t *stream, void *output_lwe_array, void *input_lwe_array,
    uint64_t scalar, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {

  host_integer_small_scalar_mult_radix(
      stream, static_cast<uint64_t *>(output_lwe_array),
      static_cast<uint64_t *>(input_lwe_array), scalar, lwe_dimension,
      lwe_ciphertext_count);
}
639  backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh  Normal file
@@ -0,0 +1,639 @@
#ifndef CUDA_INTEGER_MULT_CUH
#define CUDA_INTEGER_MULT_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "linear_algebra.h"
#include "pbs/bootstrap_amortized.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
#include <iostream>
#include <omp.h>
#include <sstream>
#include <string>
#include <vector>

template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus *radix_lwe_left, Torus *lsb_ciphertext,
                    Torus *msb_ciphertext, Torus *radix_lwe_right,
                    Torus *lsb_rhs, Torus *msb_rhs, int num_blocks) {

  size_t block_id = blockIdx.x;
  double D = sqrt((2 * num_blocks + 1) * (2 * num_blocks + 1) - 8 * block_id);
  size_t radix_id = int((2 * num_blocks + 1 - D) / 2.);
  size_t local_block_id =
      block_id - (2 * num_blocks - radix_id + 1) / 2. * radix_id;
  bool process_msb = (local_block_id < (num_blocks - radix_id - 1));
  auto cur_lsb_block = &lsb_ciphertext[block_id * (params::degree + 1)];
  auto cur_msb_block =
      (process_msb)
          ? &msb_ciphertext[(block_id - radix_id) * (params::degree + 1)]
          : nullptr;

  auto cur_lsb_rhs_block = &lsb_rhs[block_id * (params::degree + 1)];
  auto cur_msb_rhs_block =
      (process_msb) ? &msb_rhs[(block_id - radix_id) * (params::degree + 1)]
                    : nullptr;

  auto cur_ct_right = &radix_lwe_right[radix_id * (params::degree + 1)];
  auto cur_src = &radix_lwe_left[local_block_id * (params::degree + 1)];

  size_t tid = threadIdx.x;

  for (int i = 0; i < params::opt; i++) {
    Torus value = cur_src[tid];
    if (process_msb) {
      cur_lsb_block[tid] = cur_msb_block[tid] = value;
      cur_lsb_rhs_block[tid] = cur_msb_rhs_block[tid] = cur_ct_right[tid];
    } else {
      cur_lsb_block[tid] = value;
      cur_lsb_rhs_block[tid] = cur_ct_right[tid];
    }
    tid += params::degree / params::opt;
  }
  if (threadIdx.x == 0) {
    Torus value = cur_src[params::degree];
    if (process_msb) {
      cur_lsb_block[params::degree] = cur_msb_block[params::degree] = value;
      cur_lsb_rhs_block[params::degree] = cur_msb_rhs_block[params::degree] =
          cur_ct_right[params::degree];
    } else {
      cur_lsb_block[params::degree] = value;
      cur_lsb_rhs_block[params::degree] = cur_ct_right[params::degree];
    }
  }
}

template <typename Torus>
void compress_device_array_with_map(cuda_stream_t *stream, Torus *src,
                                    Torus *dst, int *S, int *F, int num_blocks,
                                    uint32_t map_size, uint32_t unit_size,
                                    int &total_copied, bool is_message) {
  for (int i = 0; i < map_size; i++) {
    int s_index = i * num_blocks + S[i];
    int number_of_unit = F[i] - S[i] + is_message;
    auto cur_dst = &dst[total_copied * unit_size];
    auto cur_src = &src[s_index * unit_size];
    size_t copy_size = unit_size * number_of_unit * sizeof(Torus);
    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
    total_copied += number_of_unit;
  }
}

template <typename Torus>
void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src,
                                         Torus *dst, int *S, int *F,
                                         uint32_t map_size, uint32_t unit_size,
                                         int &total_copied,
                                         int &total_radix_copied,
                                         int num_blocks, bool is_message) {
  size_t radix_size = unit_size * num_blocks;
  for (int i = 0; i < map_size; i++) {
    auto cur_dst_radix = &dst[total_radix_copied * radix_size];

    int s_index = S[i];
    int number_of_unit = F[i] - s_index + is_message;

    if (!is_message) {
      int zero_block_count = num_blocks - number_of_unit;
      cuda_memset_async(cur_dst_radix, 0,
                        zero_block_count * unit_size * sizeof(Torus), stream);
      s_index = zero_block_count;
    }

    auto cur_dst = &cur_dst_radix[s_index * unit_size];
    auto cur_src = &src[total_copied * unit_size];

    size_t copy_size = unit_size * number_of_unit * sizeof(Torus);
    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);
    total_copied += number_of_unit;
    ++total_radix_copied;
  }
}

template <typename Torus, class params>
__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
                                uint32_t chunk_size, uint32_t num_blocks) {

  extern __shared__ Torus result[];
  size_t chunk_id = blockIdx.x;
  size_t chunk_elem_size = chunk_size * num_blocks * (params::degree + 1);
  size_t radix_elem_size = num_blocks * (params::degree + 1);
  auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
  auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
  size_t block_stride = blockIdx.y * (params::degree + 1);
  auto dst_block = &dst_radix[block_stride];

  // init shared mem with first radix of chunk
  size_t tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    result[tid] = src_chunk[block_stride + tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    result[params::degree] = src_chunk[block_stride + params::degree];
  }

  // accumulate rest of the radixes
  for (int r_id = 1; r_id < chunk_size; r_id++) {
    auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      result[tid] += cur_src_radix[block_stride + tid];
      tid += params::degree / params::opt;
    }
    if (threadIdx.x == 0) {
      result[params::degree] += cur_src_radix[block_stride + params::degree];
    }
  }

  // put result from shared mem to global mem
  tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    dst_block[tid] = result[tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    dst_block[params::degree] = result[params::degree];
  }
}

template <typename Torus, class params>
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
                                        Torus *msb_blocks,
                                        uint32_t glwe_dimension,
                                        uint32_t lsb_count, uint32_t msb_count,
                                        uint32_t num_blocks) {
  size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
  size_t big_lwe_id = blockIdx.x;
  size_t radix_id = big_lwe_id / num_blocks;
  size_t block_id = big_lwe_id % num_blocks;
  size_t lsb_block_id = block_id - radix_id;
  size_t msb_block_id = block_id - radix_id - 1;

  bool process_lsb = (radix_id <= block_id);
  bool process_msb = (radix_id + 1 <= block_id);

  auto cur_res_lsb_ct = &result_blocks[big_lwe_id * big_lwe_dimension];
  auto cur_res_msb_ct =
      &result_blocks[num_blocks * num_blocks * big_lwe_dimension +
                     big_lwe_id * big_lwe_dimension];
  Torus *cur_lsb_radix = &lsb_blocks[(2 * num_blocks - radix_id + 1) *
                                     radix_id / 2 * (params::degree + 1)];
  Torus *cur_msb_radix = (process_msb)
                             ? &msb_blocks[(2 * num_blocks - radix_id - 1) *
                                           radix_id / 2 * (params::degree + 1)]
                             : nullptr;
  Torus *cur_lsb_ct = (process_lsb)
                          ? &cur_lsb_radix[lsb_block_id * (params::degree + 1)]
                          : nullptr;
  Torus *cur_msb_ct = (process_msb)
                          ? &cur_msb_radix[msb_block_id * (params::degree + 1)]
                          : nullptr;
  size_t tid = threadIdx.x;

  for (int i = 0; i < params::opt; i++) {
    cur_res_lsb_ct[tid] = (process_lsb) ? cur_lsb_ct[tid] : 0;
    cur_res_msb_ct[tid] = (process_msb) ? cur_msb_ct[tid] : 0;
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    cur_res_lsb_ct[params::degree] =
        (process_lsb) ? cur_lsb_ct[params::degree] : 0;
    cur_res_msb_ct[params::degree] =
        (process_msb) ? cur_msb_ct[params::degree] : 0;
  }
}

template <typename Torus, typename STorus, class params>
__host__ void host_integer_mult_radix_kb(
    cuda_stream_t *stream, uint64_t *radix_lwe_out, uint64_t *radix_lwe_left,
    uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
  auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
  auto message_modulus = mem_ptr->params.message_modulus;
  auto carry_modulus = mem_ptr->params.carry_modulus;

  int big_lwe_dimension = glwe_dimension * polynomial_size;
  int big_lwe_size = big_lwe_dimension + 1;

  // 'vector_result_lsb' contains blocks from all possible right shifts of
  // radix_lwe_left, only nonzero blocks are kept
  int lsb_vector_block_count = num_blocks * (num_blocks + 1) / 2;

  // 'vector_result_msb' contains blocks from all possible shifts of
  // radix_lwe_left except the last blocks of each shift. Only nonzero blocks
  // are kept
  int msb_vector_block_count = num_blocks * (num_blocks - 1) / 2;

  // total number of blocks msb and lsb
  int total_block_count = lsb_vector_block_count + msb_vector_block_count;

  // buffer to keep all lsb and msb shifts
  // for lsb all nonzero blocks of each right shift are kept
  //   for shift 0, num_blocks blocks
  //   for shift 1, num_blocks - 1 blocks
  //   for shift num_blocks - 1, 1 block
  //   (num_blocks + 1) * num_blocks / 2 blocks
  // for msb we don't keep track of the last blocks so
  //   for shift 0, num_blocks - 1 blocks
  //   for shift 1, num_blocks - 2 blocks
  //   for shift num_blocks - 1, 0 blocks
  //   (num_blocks - 1) * num_blocks / 2 blocks
  // in total num_blocks^2 blocks
  // in each block there is a big polynomial with
  // glwe_dimension * polynomial_size + 1 coefficients
  // (see the illustrative sketch after this function)
  auto vector_result_sb = mem_ptr->vector_result_sb;

  // buffer to keep lsb_vector + msb_vector
  // addition will happen in full terms so there will be
  // num_blocks terms and each term will have num_blocks blocks
  // num_blocks^2 blocks in total
  // and each block has a big lwe ciphertext with
  // glwe_dimension * polynomial_size + 1 coefficients
  auto block_mul_res = mem_ptr->block_mul_res;

  // buffer to keep keyswitch result of num_blocks^2 ciphertexts
  // in total it has num_blocks^2 small lwe ciphertexts with
  // lwe_dimension + 1 coefficients
  auto small_lwe_vector = mem_ptr->small_lwe_vector;

  // buffer to keep pbs result for num_blocks^2 lwe ciphertexts
  // in total it has num_blocks^2 big lwe ciphertexts with
  // glwe_dimension * polynomial_size + 1 coefficients
  auto lwe_pbs_out_array = mem_ptr->lwe_pbs_out_array;

  // it contains two luts, first for lsb extraction,
  // second for msb extraction, with total length =
  // 2 * (glwe_dimension + 1) * polynomial_size
  auto luts_array = mem_ptr->luts_array;

  // accumulator to extract message
  // with length (glwe_dimension + 1) * polynomial_size
  auto luts_message = mem_ptr->luts_message;

  // accumulator to extract carry
  // with length (glwe_dimension + 1) * polynomial_size
  auto luts_carry = mem_ptr->luts_carry;

  // to be used as default indexing
  auto lwe_indexes = luts_array->lwe_indexes;

  auto vector_result_lsb = &vector_result_sb[0];
  auto vector_result_msb =
      &vector_result_sb[lsb_vector_block_count *
                        (polynomial_size * glwe_dimension + 1)];

  auto vector_lsb_rhs = &block_mul_res[0];
  auto vector_msb_rhs = &block_mul_res[lsb_vector_block_count *
                                       (polynomial_size * glwe_dimension + 1)];

  dim3 grid(lsb_vector_block_count, 1, 1);
  dim3 thds(params::degree / params::opt, 1, 1);

  all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, stream->stream>>>(
      radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
      vector_lsb_rhs, vector_msb_rhs, num_blocks);

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
      total_block_count, luts_array);

  vector_result_lsb = &block_mul_res[0];
  vector_result_msb = &block_mul_res[lsb_vector_block_count *
                                     (polynomial_size * glwe_dimension + 1)];

  fill_radix_from_lsb_msb<Torus, params>
      <<<num_blocks * num_blocks, params::degree / params::opt, 0,
         stream->stream>>>(vector_result_sb, vector_result_lsb,
                           vector_result_msb, glwe_dimension,
                           lsb_vector_block_count, msb_vector_block_count,
                           num_blocks);

  auto new_blocks = block_mul_res;
  auto old_blocks = vector_result_sb;

  // amount of current radixes after block_mul
  size_t r = 2 * num_blocks;

  size_t total_modulus = message_modulus * carry_modulus;
  size_t message_max = message_modulus - 1;
  size_t chunk_size = (total_modulus - 1) / message_max;
  size_t ch_amount = r / chunk_size;

  int terms_degree[r * num_blocks];
  int f_b[ch_amount];
  int l_b[ch_amount];

  for (int i = 0; i < num_blocks * num_blocks; i++) {
    size_t r_id = i / num_blocks;
    size_t b_id = i % num_blocks;
    terms_degree[i] = (b_id >= r_id) ? 3 : 0;
  }
  auto terms_degree_msb = &terms_degree[num_blocks * num_blocks];
  for (int i = 0; i < num_blocks * num_blocks; i++) {
    size_t r_id = i / num_blocks;
    size_t b_id = i % num_blocks;
    terms_degree_msb[i] = (b_id > r_id) ? 2 : 0;
  }

  auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
  while (r > chunk_size) {
    int cur_total_blocks = r * num_blocks;
    ch_amount = r / chunk_size;
    dim3 add_grid(ch_amount, num_blocks, 1);
    size_t sm_size = big_lwe_size * sizeof(Torus);
    cuda_memset_async(new_blocks, 0,
                      ch_amount * num_blocks * big_lwe_size * sizeof(Torus),
                      stream);

    tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
        new_blocks, old_blocks, chunk_size, num_blocks);

    for (int c_id = 0; c_id < ch_amount; c_id++) {
      auto cur_chunk = &terms_degree[c_id * chunk_size * num_blocks];
      int mx = 0;
      int mn = num_blocks;
      for (int r_id = 1; r_id < chunk_size; r_id++) {
        auto cur_radix = &cur_chunk[r_id * num_blocks];
        for (int i = 0; i < num_blocks; i++) {
          if (cur_radix[i]) {
            mn = min(mn, i);
            mx = max(mx, i);
          }
        }
      }
      f_b[c_id] = mn;
      l_b[c_id] = mx;
    }

    int total_copied = 0;
    int message_count = 0;
    int carry_count = 0;
    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
                                          l_b, num_blocks, ch_amount,
                                          big_lwe_size, total_copied, true);

    message_count = total_copied;
    compress_device_array_with_map<Torus>(stream, new_blocks, old_blocks, f_b,
                                          l_b, num_blocks, ch_amount,
                                          big_lwe_size, total_copied, false);
    carry_count = total_copied - message_count;

    auto message_blocks_vector = old_blocks;
    auto carry_blocks_vector =
        &old_blocks[message_count * (glwe_dimension * polynomial_size + 1)];

    cuda_keyswitch_lwe_ciphertext_vector(
        stream, small_lwe_vector, lwe_indexes, old_blocks, lwe_indexes, ksk,
        polynomial_size * glwe_dimension, lwe_dimension,
        mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);

    execute_pbs<Torus>(
        stream, message_blocks_vector, lwe_indexes, luts_message->lut,
        luts_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
        luts_message->pbs_buffer, glwe_dimension, lwe_dimension,
        polynomial_size, mem_ptr->params.pbs_base_log,
        mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
        message_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type);

    execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
                       luts_carry->lut, luts_carry->lut_indexes,
                       &small_lwe_vector[message_count * (lwe_dimension + 1)],
                       lwe_indexes, bsk, luts_carry->pbs_buffer,
                       glwe_dimension, lwe_dimension, polynomial_size,
                       mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
                       mem_ptr->params.grouping_factor, carry_count, 1, 0,
                       max_shared_memory, mem_ptr->params.pbs_type);

    int rem_blocks = r % chunk_size * num_blocks;
    int new_blocks_created = 2 * ch_amount * num_blocks;
    int copy_size = rem_blocks * big_lwe_size * sizeof(Torus);

    auto cur_dst = &new_blocks[new_blocks_created * big_lwe_size];
    auto cur_src = &old_blocks[(cur_total_blocks - rem_blocks) * big_lwe_size];
    cuda_memcpy_async_gpu_to_gpu(cur_dst, cur_src, copy_size, stream);

    total_copied = 0;
    int total_radix_copied = 0;
    extract_message_carry_to_full_radix<Torus>(
        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
        total_copied, total_radix_copied, num_blocks, true);
    extract_message_carry_to_full_radix<Torus>(
        stream, old_blocks, new_blocks, f_b, l_b, ch_amount, big_lwe_size,
        total_copied, total_radix_copied, num_blocks, false);

    std::swap(new_blocks, old_blocks);
    r = (new_blocks_created + rem_blocks) / num_blocks;
  }

  dim3 add_grid(1, num_blocks, 1);
  size_t sm_size = big_lwe_size * sizeof(Torus);
  cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus),
                    stream);
  tree_add_chunks<Torus, params><<<add_grid, 256, sm_size, stream->stream>>>(
      radix_lwe_out, old_blocks, r, num_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, vector_result_sb, radix_lwe_out, bsk, ksk, num_blocks,
      luts_message);
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      stream, &block_mul_res[big_lwe_size], radix_lwe_out, bsk, ksk, num_blocks,
      luts_carry);

  cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);

  host_addition(stream, radix_lwe_out, vector_result_sb, block_mul_res,
                big_lwe_size, num_blocks);

  host_propagate_single_carry_low_latency<Torus>(
      stream, radix_lwe_out, mem_ptr->scp_mem, bsk, ksk, num_blocks);
}
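// Illustrative sketch (not part of the backend): the block-count arithmetic
// used above, on the host and in cleartext. For num_blocks radix blocks the
// schoolbook product keeps n*(n+1)/2 lsb blocks and n*(n-1)/2 msb blocks
// (n^2 in total), and the partial radixes are then summed in chunks of
// chunk_size = (total_modulus - 1) / (message_modulus - 1) terms so that no
// carry overflows before the next bootstrap. Values below are examples only.
#include <cassert>
#include <cstdint>

inline void mult_block_count_example() {
  uint64_t num_blocks = 4;
  uint64_t lsb_blocks = num_blocks * (num_blocks + 1) / 2; // 10
  uint64_t msb_blocks = num_blocks * (num_blocks - 1) / 2; // 6
  assert(lsb_blocks + msb_blocks == num_blocks * num_blocks);

  uint64_t message_modulus = 4, carry_modulus = 4;
  uint64_t total_modulus = message_modulus * carry_modulus;            // 16
  uint64_t chunk_size = (total_modulus - 1) / (message_modulus - 1);   // 5
  // Starting from r = 2 * num_blocks partial radixes, each pass of the loop
  // above replaces every group of chunk_size radixes by 2 (message + carry).
  (void)lsb_blocks;
  (void)msb_blocks;
  (void)chunk_size;
}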

template <typename Torus>
__host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
    cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
    bool allocate_gpu_memory) {
  *mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
                                       allocate_gpu_memory);
}

// Function to apply lookup table,
// It has two modes
// lsb_msb_mode == true - extracts lsb and msb
// lsb_msb_mode == false - extracts message and carry
template <typename Torus, typename STorus, class params>
void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
                        int_mul_memory<Torus> *mem_ptr, uint32_t glwe_dimension,
                        uint32_t lwe_dimension, uint32_t polynomial_size,
                        uint32_t pbs_base_log, uint32_t pbs_level,
                        uint32_t ks_base_log, uint32_t ks_level,
                        uint32_t grouping_factor,
                        uint32_t lsb_message_blocks_count,
                        uint32_t msb_carry_blocks_count,
                        uint32_t max_shared_memory, bool lsb_msb_mode) {

  int total_blocks_count = lsb_message_blocks_count + msb_carry_blocks_count;
  int gpu_n = mem_ptr->p2p_gpu_count;
  if (total_blocks_count < gpu_n)
    gpu_n = total_blocks_count;
  int gpu_blocks_count = total_blocks_count / gpu_n;
  int big_lwe_size = glwe_dimension * polynomial_size + 1;
  // int small_lwe_size = lwe_dimension + 1;

#pragma omp parallel for num_threads(gpu_n)
  for (int i = 0; i < gpu_n; i++) {
    cudaSetDevice(i);
    auto this_stream = mem_ptr->streams[i];
    // Index where input and output blocks start for current gpu
    int big_lwe_start_index = i * gpu_blocks_count * big_lwe_size;

    // Last gpu might have extra blocks to process if total blocks number is
    // not divisible by gpu_n
    if (i == gpu_n - 1) {
      gpu_blocks_count += total_blocks_count % gpu_n;
    }

    int can_access_peer;
    cudaDeviceCanAccessPeer(&can_access_peer, i, 0);
    if (i == 0) {
      check_cuda_error(
          cudaMemcpyAsync(mem_ptr->pbs_output_multi_gpu[i],
                          &input_ciphertexts[big_lwe_start_index],
                          gpu_blocks_count * big_lwe_size * sizeof(Torus),
                          cudaMemcpyDeviceToDevice, *this_stream));
    } else if (can_access_peer) {
      check_cuda_error(cudaMemcpyPeerAsync(
          mem_ptr->pbs_output_multi_gpu[i], i,
          &input_ciphertexts[big_lwe_start_index], 0,
          gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
    } else {
      // Uses host memory as middle ground
      cuda_memcpy_async_to_cpu(mem_ptr->device_to_device_buffer[i],
                               &input_ciphertexts[big_lwe_start_index],
                               gpu_blocks_count * big_lwe_size * sizeof(Torus),
                               this_stream, i);
      cuda_memcpy_async_to_gpu(
          mem_ptr->pbs_output_multi_gpu[i], mem_ptr->device_to_device_buffer[i],
          gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
    }

    // when lsb and msb have to be extracted
    //   for first lsb_count blocks we need lsb_acc
    //   for last msb_count blocks we need msb_acc
    // when message and carry have to be extracted
    //   for first message_count blocks we need message_acc
    //   for last carry_count blocks we need carry_acc
    Torus *cur_lut_indexes;
    if (lsb_msb_mode) {
      cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
                            ? mem_ptr->lut_indexes_lsb_multi_gpu[i]
                            : mem_ptr->lut_indexes_msb_multi_gpu[i];

    } else {
      cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
                            ? mem_ptr->lut_indexes_message_multi_gpu[i]
                            : mem_ptr->lut_indexes_carry_multi_gpu[i];
    }

    // execute keyswitch on a current gpu with corresponding input and output
    // blocks; pbs_output_multi_gpu[i] is an input for keyswitch and
    // pbs_input_multi_gpu[i] is an output for keyswitch
    cuda_keyswitch_lwe_ciphertext_vector(
        this_stream, i, mem_ptr->pbs_input_multi_gpu[i],
        mem_ptr->pbs_output_multi_gpu[i], mem_ptr->ksk_multi_gpu[i],
        polynomial_size * glwe_dimension, lwe_dimension, ks_base_log, ks_level,
        gpu_blocks_count);

    // execute pbs on a current gpu with corresponding input and output
    cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
        this_stream, i, mem_ptr->pbs_output_multi_gpu[i],
        mem_ptr->lut_multi_gpu[i], cur_lut_indexes,
        mem_ptr->pbs_input_multi_gpu[i], mem_ptr->bsk_multi_gpu[i],
        mem_ptr->pbs_buffer_multi_gpu[i], lwe_dimension, glwe_dimension,
        polynomial_size, grouping_factor, pbs_base_log, pbs_level,
        grouping_factor, gpu_blocks_count, 2, 0, max_shared_memory);

    // lookup table is applied and now data from current gpu have to be copied
    // back to gpu_0 in 'output_ciphertexts' buffer
    if (i == 0) {
      check_cuda_error(
          cudaMemcpyAsync(&output_ciphertexts[big_lwe_start_index],
                          mem_ptr->pbs_output_multi_gpu[i],
                          gpu_blocks_count * big_lwe_size * sizeof(Torus),
                          cudaMemcpyDeviceToDevice, *this_stream));
    } else if (can_access_peer) {
      check_cuda_error(cudaMemcpyPeerAsync(
          &output_ciphertexts[big_lwe_start_index], 0,
          mem_ptr->pbs_output_multi_gpu[i], i,
          gpu_blocks_count * big_lwe_size * sizeof(Torus), *this_stream));
    } else {
      // Uses host memory as middle ground
      cuda_memcpy_async_to_cpu(
          mem_ptr->device_to_device_buffer[i], mem_ptr->pbs_output_multi_gpu[i],
          gpu_blocks_count * big_lwe_size * sizeof(Torus), this_stream, i);
      cuda_memcpy_async_to_gpu(&output_ciphertexts[big_lwe_start_index],
                               mem_ptr->device_to_device_buffer[i],
                               gpu_blocks_count * big_lwe_size * sizeof(Torus),
                               this_stream, i);
    }
  }
}

template <typename T>
__global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
                                                         T *input_lwe_array,
                                                         T scalar,
                                                         uint32_t lwe_dimension,
                                                         uint32_t num_blocks) {

  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int lwe_size = lwe_dimension + 1;
  if (index < num_blocks * lwe_size) {
    // Here we take advantage of the wrapping behaviour of uint
    output_lwe_array[index] = input_lwe_array[index] * scalar;
  }
}

template <typename T>
__host__ void host_integer_small_scalar_mult_radix(
    cuda_stream_t *stream, T *output_lwe_array, T *input_lwe_array, T scalar,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  device_small_scalar_radix_multiplication<<<grid, thds, 0, stream->stream>>>(
      output_lwe_array, input_lwe_array, scalar, input_lwe_dimension,
      input_lwe_ciphertext_count);
  check_cuda_error(cudaGetLastError());
}
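// Illustrative sketch (not part of the backend): scalar multiplication of an
// LWE ciphertext is a coefficient-wise multiplication modulo 2^64, which is
// exactly the wrapping behaviour of uint64_t arithmetic exploited above.
// Values are examples only.
#include <cstdint>
#include <vector>

inline void small_scalar_mult_cleartext_example() {
  std::vector<uint64_t> ct = {0x8000000000000000ULL, 42, 7}; // mask..., body
  uint64_t scalar = 3;
  for (auto &c : ct)
    c *= scalar; // wraps modulo 2^64, matching the torus arithmetic
}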
#endif
12  backends/tfhe-cuda-backend/cuda/src/integer/negation.cu  Normal file
@@ -0,0 +1,12 @@
#include "integer/negation.cuh"
|
||||
|
||||
void cuda_negate_integer_radix_ciphertext_64_inplace(
|
||||
cuda_stream_t *stream, void *lwe_array, uint32_t lwe_dimension,
|
||||
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
|
||||
uint32_t carry_modulus) {
|
||||
|
||||
host_integer_radix_negation(stream, static_cast<uint64_t *>(lwe_array),
|
||||
static_cast<uint64_t *>(lwe_array), lwe_dimension,
|
||||
lwe_ciphertext_count, message_modulus,
|
||||
carry_modulus);
|
||||
}
|
||||
79  backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh  Normal file
@@ -0,0 +1,79 @@
#ifndef CUDA_INTEGER_NEGATE_CUH
#define CUDA_INTEGER_NEGATE_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "device.h"
#include "integer.h"
#include "utils/kernel_dimensions.cuh"

template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
                              uint64_t lwe_dimension, uint64_t message_modulus,
                              uint64_t carry_modulus, uint64_t delta) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < lwe_dimension + 1) {
    bool is_body = (tid == lwe_dimension);

    // z = ceil( degree / 2^p ) * 2^p
    uint64_t z = (2 * message_modulus - 1) / message_modulus;
    __syncthreads();
    z *= message_modulus;

    // (0,Delta*z) - ct
    output[tid] = (is_body ? z * delta - input[tid] : -input[tid]);

    for (int radix_block_id = 1; radix_block_id < num_blocks;
         radix_block_id++) {
      tid += (lwe_dimension + 1);

      // Subtract z/B from the next ciphertext to compensate for the addition
      // of z
      uint64_t zb = z / message_modulus;

      uint64_t encoded_zb = zb * delta;

      __syncthreads();

      // (0,Delta*z) - ct
      output[tid] =
          (is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
      __syncthreads();
    }
  }
}

template <typename Torus>
__host__ void host_integer_radix_negation(cuda_stream_t *stream, Torus *output,
                                          Torus *input, uint32_t lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count,
                                          uint64_t message_modulus,
                                          uint64_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);
  uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_negation<<<grid, thds, shared_mem, stream->stream>>>(
      output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
      carry_modulus, delta);
  check_cuda_error(cudaGetLastError());
}
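// Illustrative sketch (not part of the backend): the cleartext counterpart of
// the radix negation above. Each block is replaced by z * Delta - ct, where z
// is a multiple of the message modulus large enough to keep the result
// positive, and z / B is subtracted from the next block to compensate.
// Values and the helper name are examples only.
#include <cstdint>
#include <vector>

inline std::vector<uint64_t>
negate_radix_cleartext(std::vector<uint64_t> blocks, uint64_t message_modulus) {
  uint64_t borrow = 0; // z / B carried into the next block
  for (auto &b : blocks) {
    uint64_t z = ((2 * message_modulus - 1) / message_modulus) * message_modulus;
    uint64_t negated = z - (b + borrow);
    borrow = z / message_modulus;
    b = negated;
  }
  return blocks;
}

// Example with message_modulus = 4 (so z = 4, z / B = 1):
//   blocks {1, 2} (value 1 + 4 * 2 = 9) -> {4 - 1, 4 - (2 + 1)} = {3, 1},
//   i.e. 3 + 4 * 1 = 7 = 16 - 9 modulo 16, as expected.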

#endif
12  backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu  Normal file
@@ -0,0 +1,12 @@
#include "integer/scalar_addition.cuh"
|
||||
|
||||
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
|
||||
cuda_stream_t *stream, void *lwe_array, void *scalar_input,
|
||||
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus) {
|
||||
|
||||
host_integer_radix_scalar_addition_inplace(
|
||||
stream, static_cast<uint64_t *>(lwe_array),
|
||||
static_cast<uint64_t *>(scalar_input), lwe_dimension,
|
||||
lwe_ciphertext_count, message_modulus, carry_modulus);
|
||||
}
|
||||
130  backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh  Normal file
@@ -0,0 +1,130 @@
#ifndef CUDA_INTEGER_ADD_CUH
#define CUDA_INTEGER_ADD_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "device.h"
#include "integer.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>

template <typename Torus>
__global__ void device_integer_radix_scalar_addition_inplace(
    Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
    uint32_t lwe_dimension, uint64_t delta) {

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
    Torus scalar = scalar_input[tid];
    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;

    *body += scalar * delta;
  }
}

template <typename Torus>
__host__ void host_integer_radix_scalar_addition_inplace(
    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_scalar_addition_inplace<<<grid, thds, 0,
                                                 stream->stream>>>(
      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
      delta);
  check_cuda_error(cudaGetLastError());
}

template <typename Torus>
__global__ void device_integer_radix_add_scalar_one_inplace(
    Torus *lwe_array, int32_t num_blocks, uint32_t lwe_dimension,
    uint64_t delta) {

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
    *body += delta;
  }
}

template <typename Torus>
__host__ void host_integer_radix_add_scalar_one_inplace(
    cuda_stream_t *stream, Torus *lwe_array, uint32_t lwe_dimension,
    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0,
                                                stream->stream>>>(
      lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
}

template <typename Torus>
__global__ void device_integer_radix_scalar_subtraction_inplace(
    Torus *lwe_array, Torus *scalar_input, int32_t num_blocks,
    uint32_t lwe_dimension, uint64_t delta) {

  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num_blocks) {
    Torus scalar = scalar_input[tid];
    Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;

    *body -= scalar * delta;
  }
}

template <typename Torus>
__host__ void host_integer_radix_scalar_subtraction_inplace(
    cuda_stream_t *stream, Torus *lwe_array, Torus *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
  cudaSetDevice(stream->gpu_index);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
                                                    stream->stream>>>(
      lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
      delta);
  check_cuda_error(cudaGetLastError());
}
#endif
14  backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu  Normal file
@@ -0,0 +1,14 @@
#include "integer/scalar_bitops.cuh"
|
||||
|
||||
void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_input,
|
||||
void *clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr, void *bsk,
|
||||
void *ksk, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {
|
||||
|
||||
host_integer_radix_scalar_bitop_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_input),
|
||||
static_cast<uint64_t *>(clear_blocks), num_clear_blocks,
|
||||
(int_bitop_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
|
||||
lwe_ciphertext_count, op);
|
||||
}
|
||||
51  backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh  Normal file
@@ -0,0 +1,51 @@
#ifndef CUDA_INTEGER_SCALAR_BITWISE_OPS_CUH
#define CUDA_INTEGER_SCALAR_BITWISE_OPS_CUH

#include "integer/bitwise_ops.cuh"
#include <omp.h>

template <typename Torus>
__host__ void host_integer_radix_scalar_bitop_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_input,
    Torus *clear_blocks, uint32_t num_clear_blocks,
    int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    uint32_t num_radix_blocks, BITOP_TYPE op) {

  auto lut = mem_ptr->lut;
  auto params = lut->params;
  auto big_lwe_dimension = params.big_lwe_dimension;

  uint32_t lwe_size = big_lwe_dimension + 1;

  if (num_clear_blocks == 0) {
    if (op == SCALAR_BITAND) {
      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
      cuda_memset_async(lwe_array_out, 0,
                        num_radix_blocks * lwe_size * sizeof(Torus), stream);
    } else {
      cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
                                   num_radix_blocks * lwe_size * sizeof(Torus),
                                   stream);
    }
  } else {
    auto lut_buffer = lut->lut;
    // We have all possible LUTs pre-computed and we use the decomposed scalar
    // as index to recover the right one
    cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
                                 num_clear_blocks * sizeof(Torus), stream);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
        lut);

    if (op == SCALAR_BITAND) {
      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
      cuda_memset_async(lwe_array_out_block, 0,
                        (num_radix_blocks - num_clear_blocks) * lwe_size *
                            sizeof(Torus),
                        stream);
    }
  }
}

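// Illustrative sketch (not part of the backend): the cleartext idea behind the
// LUT-index trick above. For a scalar bit operation, one LUT per possible
// clear block value is precomputed, and the decomposed scalar is used to pick
// which LUT each ciphertext block goes through. Names and values below are
// examples only.
#include <cstdint>
#include <vector>

inline std::vector<uint64_t>
scalar_bitand_cleartext(const std::vector<uint64_t> &ct_blocks,
                        const std::vector<uint64_t> &clear_blocks,
                        uint64_t message_modulus) {
  std::vector<uint64_t> out(ct_blocks.size(), 0);
  for (size_t i = 0; i < clear_blocks.size() && i < ct_blocks.size(); i++) {
    // LUT selected by clear_blocks[i]; it tabulates x -> x & clear_blocks[i].
    out[i] = (ct_blocks[i] & clear_blocks[i]) % message_modulus;
  }
  // For AND, blocks beyond the clear blocks are forced to zero, as in the
  // cuda_memset_async call above.
  return out;
}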
#endif
44  backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu  Normal file
@@ -0,0 +1,44 @@
#include "integer/scalar_comparison.cuh"
|
||||
|
||||
void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *scalar_blocks, int8_t *mem_ptr, void *bsk, void *ksk,
|
||||
uint32_t lwe_ciphertext_count, uint32_t num_scalar_blocks) {
|
||||
|
||||
int_comparison_buffer<uint64_t> *buffer =
|
||||
(int_comparison_buffer<uint64_t> *)mem_ptr;
|
||||
switch (buffer->op) {
|
||||
// case EQ:
|
||||
// case NE:
|
||||
// host_integer_radix_equality_check_kb<uint64_t>(
|
||||
// stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
// static_cast<uint64_t *>(lwe_array_1),
|
||||
// static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
|
||||
// static_cast<uint64_t *>(ksk), glwe_dimension, polynomial_size,
|
||||
// big_lwe_dimension, small_lwe_dimension, ks_level, ks_base_log,
|
||||
// pbs_level, pbs_base_log, grouping_factor, lwe_ciphertext_count,
|
||||
// message_modulus, carry_modulus);
|
||||
// break;
|
||||
case GT:
|
||||
case GE:
|
||||
case LT:
|
||||
case LE:
|
||||
host_integer_radix_scalar_difference_check_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(scalar_blocks), buffer,
|
||||
buffer->diff_buffer->operator_f, bsk, static_cast<uint64_t *>(ksk),
|
||||
lwe_ciphertext_count, num_scalar_blocks);
|
||||
break;
|
||||
case MAX:
|
||||
case MIN:
|
||||
host_integer_radix_scalar_maxmin_kb<uint64_t>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
|
||||
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
|
||||
break;
|
||||
default:
|
||||
printf("Not implemented\n");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,298 @@
#ifndef CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
#define CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH

#include "integer/comparison.cuh"
#include <omp.h>

template <typename Torus>
__host__ void host_integer_radix_scalar_difference_check_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
    uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  auto diff_buffer = mem_ptr->diff_buffer;

  size_t big_lwe_size = big_lwe_dimension + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  // Reducing the signs is the bottleneck of the comparison algorithms,
  // however in the scalar case there is an improvement:
  //
  // The idea is to reduce the number of sign blocks we have to reduce.
  // We can do that by splitting the comparison problem in two parts.
  //
  // - One part where we compute the sign blocks between the scalar and just
  //   enough blocks from the ciphertext to represent the scalar value
  //
  // - The other part is to compare the ciphertext blocks not considered for the
  //   sign computation with zero, and create a single sign block from that.
  //
  // The smaller the scalar value is compared to the number of encrypted bits in
  // the ciphertext, the more comparisons with zero we have to do, and the fewer
  // sign blocks we will have to reduce.
  //
  // This will create a speedup as comparing a bunch of blocks with 0
  // is faster
  if (total_num_scalar_blocks == 0) {
    // We only have to compare blocks with zero
    // means scalar is zero
    host_compare_with_zero_equality(stream, mem_ptr->tmp_lwe_array_out,
                                    lwe_array_in, mem_ptr, bsk, ksk,
                                    total_num_radix_blocks);

    auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
      x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);

      return sign_handler_f(x);
    };

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator<Torus>(stream, lut->lut, glwe_dimension,
                                       polynomial_size, message_modulus,
                                       carry_modulus, scalar_last_leaf_lut_f);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsk, ksk, 1, lut);

    // The result will be in the first two blocks. Everything else is
    // garbage.
    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                      big_lwe_size_bytes * (total_num_radix_blocks - 1),
                      stream);

  } else if (total_num_scalar_blocks < total_num_radix_blocks) {
    // We have to handle both parts of the work described above

    uint32_t num_lsb_radix_blocks = total_num_scalar_blocks;
    uint32_t num_msb_radix_blocks =
        total_num_radix_blocks - num_lsb_radix_blocks;

    auto lsb = lwe_array_in;
    auto msb = lwe_array_in + num_lsb_radix_blocks * big_lwe_size;

    auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;

    cuda_synchronize_stream(stream);
    auto lsb_stream = diff_buffer->lsb_stream;
    auto msb_stream = diff_buffer->msb_stream;

#pragma omp parallel sections
    {
      // Both sections may be executed in parallel
#pragma omp section
      {
        //////////////
        // lsb
        Torus *lhs = diff_buffer->tmp_packed_left;
        Torus *rhs = diff_buffer->tmp_packed_right;

        pack_blocks(lsb_stream, lhs, lwe_array_in, big_lwe_dimension,
                    num_lsb_radix_blocks, message_modulus);
        pack_blocks(lsb_stream, rhs, scalar_blocks, 0, total_num_scalar_blocks,
                    message_modulus);

        // From this point on we have half the number of blocks
        num_lsb_radix_blocks /= 2;
        num_lsb_radix_blocks += (total_num_scalar_blocks % 2);

        // comparisons will be assigned
        // - 0 if lhs < rhs
        // - 1 if lhs == rhs
        // - 2 if lhs > rhs

        auto comparisons = mem_ptr->tmp_block_comparisons;
        scalar_compare_radix_blocks_kb(lsb_stream, comparisons, lhs, rhs,
                                       mem_ptr, bsk, ksk, num_lsb_radix_blocks);

        // Reduces a vec containing radix blocks that encrypt a sign
        // (inferior, equal, superior) to one single radix block containing the
        // final sign
        tree_sign_reduction(lsb_stream, lwe_array_lsb_out, comparisons,
                            mem_ptr->diff_buffer->tree_buffer,
                            mem_ptr->cleaning_lut_f, bsk, ksk,
                            num_lsb_radix_blocks);
      }
#pragma omp section
      {
        //////////////
        // msb
        host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
                                        mem_ptr, bsk, ksk,
                                        num_msb_radix_blocks);
      }
    }
    cuda_synchronize_stream(lsb_stream);
    cuda_synchronize_stream(msb_stream);

    //////////////
    // Reduce the two blocks into one final block

    auto scalar_bivariate_last_leaf_lut_f =
        [sign_handler_f](Torus lsb, Torus msb) -> Torus {
      if (msb == 1)
        return sign_handler_f(lsb);
      else
        return sign_handler_f(IS_SUPERIOR);
    };

    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
        stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
        carry_modulus, scalar_bivariate_last_leaf_lut_f);

    integer_radix_apply_bivariate_lookup_table_kb(
        stream, lwe_array_out, lwe_array_lsb_out, lwe_array_msb_out, bsk, ksk,
        1, lut);

    // The result will be in the first block. Everything else is garbage.
    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                      (total_num_radix_blocks - 1) * big_lwe_size_bytes,
                      stream);
  } else {
    // We only have to do the regular comparison
    // And not the part where we compare most significant blocks with zeros
    // total_num_radix_blocks == total_num_scalar_blocks
    uint32_t num_lsb_radix_blocks = total_num_radix_blocks;
    uint32_t num_scalar_blocks = total_num_scalar_blocks;

    auto lsb = lwe_array_in;

    Torus *lhs = diff_buffer->tmp_packed_left;
    Torus *rhs = diff_buffer->tmp_packed_right;

    pack_blocks(stream, lhs, lwe_array_in, big_lwe_dimension,
                num_lsb_radix_blocks, message_modulus);
    pack_blocks(stream, rhs, scalar_blocks, 0, num_scalar_blocks,
                message_modulus);

    // From this point on we have half the number of blocks
    num_lsb_radix_blocks /= 2;
    num_scalar_blocks /= 2;

    // comparisons will be assigned
    // - 0 if lhs < rhs
    // - 1 if lhs == rhs
    // - 2 if lhs > rhs
    auto comparisons = mem_ptr->tmp_lwe_array_out;
    scalar_compare_radix_blocks_kb(stream, comparisons, lhs, rhs, mem_ptr, bsk,
                                   ksk, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypt a sign
    // (inferior, equal, superior) to one single radix block containing the
    // final sign
    tree_sign_reduction(stream, lwe_array_out, comparisons,
                        mem_ptr->diff_buffer->tree_buffer, sign_handler_f, bsk,
                        ksk, num_lsb_radix_blocks);

    // The result will be in the first block. Everything else is garbage.
    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                      (total_num_radix_blocks - 1) * big_lwe_size_bytes,
                      stream);
  }
}

template <typename Torus>
__host__ void
scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
                               Torus *lwe_array_in, Torus *scalar_blocks,
                               int_comparison_buffer<Torus> *mem_ptr, void *bsk,
                               Torus *ksk, uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto small_lwe_dimension = params.small_lwe_dimension;
  auto ks_level = params.ks_level;
  auto ks_base_log = params.ks_base_log;
  auto pbs_level = params.pbs_level;
  auto pbs_base_log = params.pbs_base_log;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  // When rhs > lhs, the subtraction will overflow, and the bit of padding will
  // be set to 1, meaning that the output of the pbs will be the negative
  // (modulo message space)
  //
  // Example:
  // lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
  // lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
  // Since there was an overflow the bit of padding is 1 and not 0.
  // When applying the LUT for an input value of 14 we would expect 1,
  // but since the bit of padding is 1, we will get -1 modulo our message
  // space, so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000.

  auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
  cuda_memcpy_async_gpu_to_gpu(
      subtracted_blocks, lwe_array_in,
      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);
  // Subtract
  // Here we need the true lwe sub, not the one that comes from shortint.
  host_integer_radix_scalar_subtraction_inplace(
      stream, subtracted_blocks, scalar_blocks, big_lwe_dimension,
      num_radix_blocks, message_modulus, carry_modulus);

  // Apply LUT to compare to 0
  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb(stream, lwe_array_out,
                                                 subtracted_blocks, bsk, ksk,
                                                 num_radix_blocks, sign_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
  host_integer_radix_add_scalar_one_inplace(stream, lwe_array_out,
                                            big_lwe_dimension, num_radix_blocks,
                                            message_modulus, carry_modulus);
}

template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
    Torus *ksk, uint32_t total_num_radix_blocks,
    uint32_t total_num_scalar_blocks) {

  auto params = mem_ptr->params;

  // Calculates the difference sign between the ciphertext and the scalar
  // - 0 if lhs < rhs
  // - 1 if lhs == rhs
  // - 2 if lhs > rhs
  auto sign = mem_ptr->tmp_lwe_array_out;
  host_integer_radix_scalar_difference_check_kb(
      stream, sign, lwe_array_in, scalar_blocks, mem_ptr,
      mem_ptr->cleaning_lut_f, bsk, ksk, total_num_radix_blocks,
      total_num_scalar_blocks);

  // There is no optimized CMUX for scalars, so we convert to a trivial
  // ciphertext
  auto lwe_array_left = lwe_array_in;
  auto lwe_array_right = mem_ptr->tmp_block_comparisons;

  create_trivial_radix(stream, lwe_array_right, scalar_blocks,
                       params.big_lwe_dimension, total_num_radix_blocks,
                       total_num_scalar_blocks, params.message_modulus,
                       params.carry_modulus);

  // Selector
  // CMUX for Max or Min
  host_integer_radix_cmux_kb(
      stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
      lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}
#endif
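Editor's note (not part of the diff): the difference check above encodes each packed-block comparison as 0/1/2 (inferior/equal/superior) and, when the scalar is shorter than the ciphertext, combines the low-block sign with an "are all high blocks zero?" flag exactly as the bivariate last-leaf LUT does. The clear-value sketch below mirrors that combination rule; the numeric values of IS_INFERIOR/IS_EQUAL/IS_SUPERIOR and the ">=" handler standing in for operator_f are assumptions inferred from the comments in this file.

#include <cstdint>
#include <cstdio>
#include <functional>

// Encodings follow the comments above: 0 = inferior, 1 = equal, 2 = superior
// (assumed numeric values of the backend constants).
constexpr uint64_t IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2;

// lsb_sign: sign of the comparison restricted to the low blocks overlapping
// the scalar; msb_is_zero: 1 iff every remaining high block of the ciphertext
// is zero, so the high part cannot decide the comparison.
static uint64_t combine_signs(uint64_t lsb_sign, uint64_t msb_is_zero,
                              const std::function<uint64_t(uint64_t)> &sign_handler_f) {
  // Mirrors scalar_bivariate_last_leaf_lut_f: if the high blocks are all zero
  // the low-block sign decides; otherwise the ciphertext is strictly greater.
  return msb_is_zero == 1 ? sign_handler_f(lsb_sign) : sign_handler_f(IS_SUPERIOR);
}

int main() {
  // A ">=" handler maps the ternary sign to a boolean block.
  auto ge = [](uint64_t s) -> uint64_t { return s == IS_INFERIOR ? 0 : 1; };
  printf("%llu\n", (unsigned long long)combine_signs(IS_EQUAL, 1, ge));    // 1: lhs == rhs
  printf("%llu\n", (unsigned long long)combine_signs(IS_INFERIOR, 0, ge)); // 1: high blocks make lhs > rhs
  printf("%llu\n", (unsigned long long)combine_signs(IS_INFERIOR, 1, ge)); // 0: lhs < rhs
}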
40  backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu  Normal file
@@ -0,0 +1,40 @@
#include "scalar_rotate.cuh"

void scratch_cuda_integer_radix_scalar_rotate_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, SHIFT_TYPE shift_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_scalar_rotate_kb<uint64_t>(
      stream, (int_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
      shift_type, allocate_gpu_memory);
}

void cuda_integer_radix_scalar_rotate_kb_64_inplace(cuda_stream_t *stream,
                                                    void *lwe_array, uint32_t n,
                                                    int8_t *mem_ptr, void *bsk,
                                                    void *ksk,
                                                    uint32_t num_blocks) {

  host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array), n,
      (int_shift_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      num_blocks);
}

void cleanup_cuda_integer_radix_scalar_rotate(cuda_stream_t *stream,
                                              int8_t **mem_ptr_void) {

  int_shift_buffer<uint64_t> *mem_ptr =
      (int_shift_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(stream);
}
114  backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh  Normal file
@@ -0,0 +1,114 @@
|
||||
#ifndef CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH
|
||||
#define CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.cuh"
|
||||
#include "integer.h"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
#ifndef CUDA_INTEGER_SHIFT_OPS_CUH
|
||||
#define CUDA_INTEGER_SHIFT_OPS_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.cuh"
|
||||
#include "integer.h"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
|
||||
cuda_stream_t *stream, int_shift_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
|
||||
cuda_stream_t *stream, Torus *lwe_array, uint32_t n,
|
||||
int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {
|
||||
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
|
||||
size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
|
||||
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
size_t num_bits_in_message = (size_t)log2(message_modulus);
|
||||
size_t total_num_bits = num_bits_in_message * num_blocks;
|
||||
n = n % total_num_bits;
|
||||
|
||||
if (n == 0) {
|
||||
return;
|
||||
}
|
||||
size_t rotations = n / num_bits_in_message;
|
||||
size_t shift_within_block = n % num_bits_in_message;
|
||||
|
||||
Torus *rotated_buffer = mem->tmp_rotated;
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
|
||||
// rotate right all the blocks in radix ciphertext
|
||||
// copy result in new buffer
|
||||
// 256 threads are used in every block
|
||||
// block_count blocks will be used in the grid
|
||||
// one block is responsible to process single lwe ciphertext
|
||||
if (mem->shift_type == LEFT_SHIFT) {
|
||||
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
|
||||
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
|
||||
num_blocks * big_lwe_size_bytes, stream);
|
||||
|
||||
if (shift_within_block == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto receiver_blocks = lwe_array;
|
||||
auto giver_blocks = rotated_buffer;
|
||||
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
|
||||
giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
|
||||
lut_bivariate);
|
||||
|
||||
} else {
|
||||
// left shift
|
||||
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
|
||||
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
|
||||
num_blocks * big_lwe_size_bytes, stream);
|
||||
|
||||
if (shift_within_block == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto receiver_blocks = lwe_array;
|
||||
auto giver_blocks = rotated_buffer;
|
||||
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
|
||||
giver_blocks, lwe_array, 1, num_blocks, big_lwe_size);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, lwe_array, receiver_blocks, giver_blocks, bsk, ksk, num_blocks,
|
||||
lut_bivariate);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CUDA_SCALAR_OPS_CUH
|
||||
|
||||
#endif // CUDA_INTEGER_SCALAR_ROTATE_OPS_CUH
|
||||
38  backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu  Normal file
@@ -0,0 +1,38 @@
#include "scalar_shifts.cuh"

void scratch_cuda_integer_radix_scalar_shift_kb_64(
    cuda_stream_t *stream, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, SHIFT_TYPE shift_type, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  scratch_cuda_integer_radix_scalar_shift_kb<uint64_t>(
      stream, (int_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
      shift_type, allocate_gpu_memory);
}

void cuda_integer_radix_scalar_shift_kb_64_inplace(
    cuda_stream_t *stream, void *lwe_array, uint32_t shift, int8_t *mem_ptr,
    void *bsk, void *ksk, uint32_t num_blocks) {

  host_integer_radix_scalar_shift_kb_inplace<uint64_t>(
      stream, static_cast<uint64_t *>(lwe_array), shift,
      (int_shift_buffer<uint64_t> *)mem_ptr, bsk, static_cast<uint64_t *>(ksk),
      num_blocks);
}

void cleanup_cuda_integer_radix_scalar_shift(cuda_stream_t *stream,
                                             int8_t **mem_ptr_void) {

  int_shift_buffer<uint64_t> *mem_ptr =
      (int_shift_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release(stream);
}
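Editor's note (not part of the diff): the scratch/inplace wrappers above drive the implementation in scalar_shifts.cuh below, which splits a scalar shift into whole-block rotations plus a residual intra-block shift (rotations = shift / bits_per_block, shift_within_block = shift % bits_per_block), filling the rotated-in blocks with trivial zeros and letting each block recover the bits spilled by its neighbour. The clear-value sketch below reproduces that decomposition for a left shift; the 2-bit block width and the standalone main() are assumptions for the illustration.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Clear-value model of host_integer_radix_scalar_shift_kb_inplace (LEFT_SHIFT):
// blocks are stored LSB-first, each holding num_bits_in_block bits.
static void scalar_left_shift_model(std::vector<uint64_t> &blocks,
                                    uint32_t shift, uint32_t num_bits_in_block) {
  const uint64_t mask = (1u << num_bits_in_block) - 1;
  const size_t num_blocks = blocks.size();
  size_t rotations = std::min<size_t>(shift / num_bits_in_block, num_blocks);
  size_t shift_within_block = shift % num_bits_in_block;

  // Whole-block part: move every block up by `rotations`, fill the bottom with zeros.
  std::vector<uint64_t> rotated(num_blocks, 0);
  for (size_t i = 0; i + rotations < num_blocks; ++i)
    rotated[i + rotations] = blocks[i];

  // Intra-block part: each block keeps its shifted low bits and recovers the
  // bits that spilled out of its lower neighbour.
  for (size_t i = 0; i < num_blocks; ++i) {
    uint64_t cur = rotated[i];
    uint64_t prev = i > rotations ? rotated[i - 1] : 0;
    blocks[i] = ((cur << shift_within_block) |
                 (prev >> (num_bits_in_block - shift_within_block))) & mask;
  }
}

int main() {
  std::vector<uint64_t> ct = {3, 1, 0, 0}; // value 0b0111 = 7, 2 bits per block
  scalar_left_shift_model(ct, 3, 2);       // 7 << 3 = 56 = 0b111000
  for (uint64_t b : ct)
    printf("%llu ", (unsigned long long)b); // prints: 0 2 3 0
  printf("\n");
}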
125  backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh  Normal file
@@ -0,0 +1,125 @@
|
||||
#ifndef CUDA_INTEGER_SHIFT_OPS_CUH
|
||||
#define CUDA_INTEGER_SHIFT_OPS_CUH
|
||||
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer.cuh"
|
||||
#include "integer.h"
|
||||
#include "pbs/bootstrap_low_latency.cuh"
|
||||
#include "pbs/bootstrap_multibit.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_radix_scalar_shift_kb(
|
||||
cuda_stream_t *stream, int_shift_buffer<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_radix_scalar_shift_kb_inplace(
|
||||
cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
|
||||
int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {
|
||||
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
auto message_modulus = params.message_modulus;
|
||||
|
||||
size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
|
||||
size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
|
||||
|
||||
size_t num_bits_in_block = (size_t)log2(message_modulus);
|
||||
size_t total_num_bits = num_bits_in_block * num_blocks;
|
||||
shift = shift % total_num_bits;
|
||||
|
||||
if (shift == 0) {
|
||||
return;
|
||||
}
|
||||
size_t rotations = std::min(shift / num_bits_in_block, (size_t)num_blocks);
|
||||
size_t shift_within_block = shift % num_bits_in_block;
|
||||
|
||||
Torus *rotated_buffer = mem->tmp_rotated;
|
||||
|
||||
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
|
||||
auto lut_univariate = mem->lut_buffers_univariate[shift_within_block];
|
||||
|
||||
// rotate right all the blocks in radix ciphertext
|
||||
// copy result in new buffer
|
||||
// 256 threads are used in every block
|
||||
// block_count blocks will be used in the grid
|
||||
// one block is responsible to process single lwe ciphertext
|
||||
if (mem->shift_type == LEFT_SHIFT) {
|
||||
radix_blocks_rotate_right<<<num_blocks, 256, 0, stream->stream>>>(
|
||||
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
|
||||
|
||||
// create trivial assign for value = 0
|
||||
cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
|
||||
stream);
|
||||
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
|
||||
num_blocks * big_lwe_size_bytes, stream);
|
||||
|
||||
if (shift_within_block == 0 || rotations == num_blocks) {
|
||||
return;
|
||||
}
|
||||
|
||||
// check if we have enough blocks for partial processing
|
||||
if (rotations < num_blocks - 1) {
|
||||
auto partial_current_blocks = &lwe_array[(rotations + 1) * big_lwe_size];
|
||||
auto partial_previous_blocks = &lwe_array[rotations * big_lwe_size];
|
||||
|
||||
size_t partial_block_count = num_blocks - rotations - 1;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, partial_current_blocks, partial_current_blocks,
|
||||
partial_previous_blocks, bsk, ksk, partial_block_count,
|
||||
lut_bivariate);
|
||||
}
|
||||
|
||||
auto rest = &lwe_array[rotations * big_lwe_size];
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, rest, rest, bsk, ksk, 1, lut_univariate);
|
||||
|
||||
} else {
|
||||
// right shift
|
||||
radix_blocks_rotate_left<<<num_blocks, 256, 0, stream->stream>>>(
|
||||
rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size);
|
||||
|
||||
// rotate left as the blocks are from LSB to MSB
|
||||
// create trivial assign for value = 0
|
||||
cuda_memset_async(rotated_buffer + (num_blocks - rotations) * big_lwe_size,
|
||||
0, rotations * big_lwe_size_bytes, stream);
|
||||
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
|
||||
num_blocks * big_lwe_size_bytes, stream);
|
||||
|
||||
if (shift_within_block == 0 || rotations == num_blocks) {
|
||||
return;
|
||||
}
|
||||
|
||||
// check if we have enough blocks for partial processing
|
||||
if (rotations < num_blocks - 1) {
|
||||
auto partial_current_blocks = lwe_array;
|
||||
auto partial_next_blocks = &lwe_array[big_lwe_size];
|
||||
|
||||
size_t partial_block_count = num_blocks - rotations - 1;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
stream, partial_current_blocks, partial_current_blocks,
|
||||
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
|
||||
}
|
||||
|
||||
// The right-most block is done separately as it does not
|
||||
// need to recuperate the shifted bits from its right neighbour.
|
||||
auto last_block = &lwe_array[(num_blocks - rotations - 1) * big_lwe_size];
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
stream, last_block, last_block, bsk, ksk, 1, lut_univariate);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CUDA_SCALAR_OPS_CUH
|
||||
109  backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu  Normal file
@@ -0,0 +1,109 @@
|
||||
#include "linearalgebra/addition.cuh"
|
||||
|
||||
/*
|
||||
* Perform the addition of two u32 input LWE ciphertext vectors.
|
||||
* See the equivalent operation on u64 ciphertexts for more details.
|
||||
*/
|
||||
void cuda_add_lwe_ciphertext_vector_32(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in_1,
|
||||
void *lwe_array_in_2,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_addition(stream, static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in_1),
|
||||
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform the addition of two u64 input LWE ciphertext vectors.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - `lwe_array_out` is an array of size
|
||||
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
|
||||
* been allocated on the GPU before calling this function, and that will hold
|
||||
* the result of the computation.
|
||||
* - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it
|
||||
* should have been allocated and initialized before calling this function. It
|
||||
* has the same size as the output array.
|
||||
* - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it
|
||||
* should have been allocated and initialized before calling this function. It
|
||||
* has the same size as the output array.
|
||||
* - `input_lwe_dimension` is the number of mask elements in the two input and
|
||||
* in the output ciphertext vectors
|
||||
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
|
||||
* input LWE ciphertext vector, as well as in the output.
|
||||
*
|
||||
* Each element (mask element or body) of the input LWE ciphertext vector 1 is
|
||||
* added to the corresponding element in the input LWE ciphertext 2. The result
|
||||
* is stored in the output LWE ciphertext vector. The two input LWE ciphertext
|
||||
* vectors are left unchanged. This function is a wrapper to a device function
|
||||
* that performs the operation on the GPU.
|
||||
*/
|
||||
void cuda_add_lwe_ciphertext_vector_64(cuda_stream_t *stream,
|
||||
void *lwe_array_out,
|
||||
void *lwe_array_in_1,
|
||||
void *lwe_array_in_2,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_addition(stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in_1),
|
||||
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
/*
|
||||
* Perform the addition of a u32 input LWE ciphertext vector with a u32
|
||||
* plaintext vector. See the equivalent operation on u64 data for more details.
|
||||
*/
|
||||
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_addition_plaintext(stream, static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(plaintext_array_in),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
}
|
||||
/*
|
||||
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
|
||||
* plaintext vector.
|
||||
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
|
||||
* launch
|
||||
* - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
* - `lwe_array_out` is an array of size
|
||||
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
|
||||
* been allocated on the GPU before calling this function, and that will hold
|
||||
* the result of the computation.
|
||||
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
|
||||
* been allocated and initialized before calling this function. It has the same
|
||||
* size as the output array.
|
||||
* - `plaintext_array_in` is the plaintext vector used as input, it should have
|
||||
* been allocated and initialized before calling this function. It should be of
|
||||
* size `input_lwe_ciphertext_count`.
|
||||
* - `input_lwe_dimension` is the number of mask elements in the input and
|
||||
* output LWE ciphertext vectors
|
||||
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
|
||||
* input LWE ciphertext vector, as well as in the output. It is also the number
|
||||
* of plaintexts in the input plaintext vector.
|
||||
*
|
||||
* Each plaintext of the input plaintext vector is added to the body of the
|
||||
* corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
|
||||
* operation is stored in the output LWE ciphertext vector. The two input
|
||||
* vectors are unchanged. This function is a wrapper to a device function that
|
||||
* performs the operation on the GPU.
|
||||
*/
|
||||
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
|
||||
void *plaintext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_addition_plaintext(stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(plaintext_array_in),
|
||||
input_lwe_dimension, input_lwe_ciphertext_count);
|
||||
}
|
||||
154  backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh  Normal file
@@ -0,0 +1,154 @@
|
||||
#ifndef CUDA_ADD_CUH
|
||||
#define CUDA_ADD_CUH
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "../utils/kernel_dimensions.cuh"
|
||||
#include "device.h"
|
||||
#include "linear_algebra.h"
|
||||
#include <stdio.h>
|
||||
|
||||
template <typename T>
|
||||
__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t num_entries) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
int plaintext_index = blockIdx.x * blockDim.x + tid;
|
||||
if (plaintext_index < num_entries) {
|
||||
int index =
|
||||
plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
|
||||
// Here we take advantage of the wrapping behaviour of uint
|
||||
output[index] = lwe_input[index] + plaintext_input[plaintext_index];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void host_addition_plaintext(cuda_stream_t *stream, T *output,
|
||||
T *lwe_input, T *plaintext_input,
|
||||
uint32_t lwe_dimension,
|
||||
uint32_t lwe_ciphertext_count) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = lwe_ciphertext_count;
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count, stream);
|
||||
plaintext_addition<<<grid, thds, 0, stream->stream>>>(
|
||||
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void addition(T *output, T *input_1, T *input_2,
|
||||
uint32_t num_entries) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
int index = blockIdx.x * blockDim.x + tid;
|
||||
if (index < num_entries) {
|
||||
// Here we take advantage of the wrapping behaviour of uint
|
||||
output[index] = input_1[index] + input_2[index];
|
||||
}
|
||||
}
|
||||
|
||||
// Coefficient-wise addition
|
||||
template <typename T>
|
||||
__host__ void host_addition(cuda_stream_t *stream, T *output, T *input_1,
|
||||
T *input_2, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
// lwe_size includes the presence of the body
|
||||
// whereas lwe_dimension is the number of elements in the mask
|
||||
int lwe_size = input_lwe_dimension + 1;
|
||||
// Create a 1-dimensional grid of threads
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = input_lwe_ciphertext_count * lwe_size;
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
addition<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
|
||||
num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void subtraction(T *output, T *input_1, T *input_2,
|
||||
uint32_t num_entries) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
int index = blockIdx.x * blockDim.x + tid;
|
||||
if (index < num_entries) {
|
||||
// Here we take advantage of the wrapping behaviour of uint
|
||||
output[index] = input_1[index] - input_2[index];
|
||||
}
|
||||
}
|
||||
|
||||
// Coefficient-wise subtraction
|
||||
template <typename T>
|
||||
__host__ void host_subtraction(cuda_stream_t *stream, T *output, T *input_1,
|
||||
T *input_2, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
// lwe_size includes the presence of the body
|
||||
// whereas lwe_dimension is the number of elements in the mask
|
||||
int lwe_size = input_lwe_dimension + 1;
|
||||
// Create a 1-dimensional grid of threads
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = input_lwe_ciphertext_count * lwe_size;
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
subtraction<<<grid, thds, 0, stream->stream>>>(output, input_1, input_2,
|
||||
num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t num_entries) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
int plaintext_index = blockIdx.x * blockDim.x + tid;
|
||||
if (plaintext_index < num_entries) {
|
||||
int index =
|
||||
plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
|
||||
// Here we take advantage of the wrapping behaviour of uint
|
||||
lwe_ct[index] -= plaintext_input[plaintext_index];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void host_subtraction_plaintext(cuda_stream_t *stream, T *output,
|
||||
T *lwe_input, T *plaintext_input,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = input_lwe_ciphertext_count;
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
|
||||
input_lwe_ciphertext_count *
|
||||
(input_lwe_dimension + 1) * sizeof(T),
|
||||
stream);
|
||||
|
||||
radix_body_subtraction_inplace<<<grid, thds, 0, stream->stream>>>(
|
||||
output, plaintext_input, input_lwe_dimension, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
#endif // CUDA_ADD_H
|
||||
@@ -0,0 +1,56 @@
#include "linearalgebra/multiplication.cuh"

/*
 * Perform the multiplication of a u32 input LWE ciphertext vector with a u32
 * cleartext vector. See the equivalent operation on u64 data for more details.
 */
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

  host_cleartext_multiplication(stream, static_cast<uint32_t *>(lwe_array_out),
                                static_cast<uint32_t *>(lwe_array_in),
                                static_cast<uint32_t *>(cleartext_array_in),
                                input_lwe_dimension,
                                input_lwe_ciphertext_count);
}
/*
 * Perform the multiplication of a u64 input LWE ciphertext vector with a u64
 * input cleartext vector.
 * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
 * launch
 * - `gpu_index` is the index of the GPU to be used in the kernel launch
 * - `lwe_array_out` is an array of size
 * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
 * been allocated on the GPU before calling this function, and that will hold
 * the result of the computation.
 * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
 * been allocated and initialized before calling this function. It has the same
 * size as the output array.
 * - `cleartext_array_in` is the cleartext vector used as input, it should have
 * been allocated and initialized before calling this function. It should be of
 * size `input_lwe_ciphertext_count`.
 * - `input_lwe_dimension` is the number of mask elements in the input and
 * output LWE ciphertext vectors
 * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
 * input LWE ciphertext vector, as well as in the output. It is also the number
 * of cleartexts in the input cleartext vector.
 *
 * Each cleartext of the input cleartext vector is multiplied to the mask and
 * body of the corresponding LWE ciphertext in the LWE ciphertext vector. The
 * result of the operation is stored in the output LWE ciphertext vector. The
 * two input vectors are unchanged. This function is a wrapper to a device
 * function that performs the operation on the GPU.
 */
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_array_in,
    void *cleartext_array_in, uint32_t input_lwe_dimension,
    uint32_t input_lwe_ciphertext_count) {

  host_cleartext_multiplication(stream, static_cast<uint64_t *>(lwe_array_out),
                                static_cast<uint64_t *>(lwe_array_in),
                                static_cast<uint64_t *>(cleartext_array_in),
                                input_lwe_dimension,
                                input_lwe_ciphertext_count);
}
@@ -0,0 +1,52 @@
#ifndef CUDA_MULT_CUH
#define CUDA_MULT_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "linear_algebra.h"
#include <fstream>
#include <iostream>
#include <vector>

template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
                         uint32_t input_lwe_dimension, uint32_t num_entries) {

  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
  if (index < num_entries) {
    int cleartext_index = index / (input_lwe_dimension + 1);
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = lwe_input[index] * cleartext_input[cleartext_index];
  }
}

template <typename T>
__host__ void
host_cleartext_multiplication(cuda_stream_t *stream, T *output, T *lwe_input,
                              T *cleartext_input, uint32_t input_lwe_dimension,
                              uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  cleartext_multiplication<<<grid, thds, 0, stream->stream>>>(
      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
}

#endif // CUDA_MULT_H
@@ -0,0 +1,49 @@
#include "linearalgebra/negation.cuh"

/*
 * Perform the negation of a u32 input LWE ciphertext vector.
 * See the equivalent operation on u64 ciphertexts for more details.
 */
void cuda_negate_lwe_ciphertext_vector_32(cuda_stream_t *stream,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

  host_negation(stream, static_cast<uint32_t *>(lwe_array_out),
                static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
                input_lwe_ciphertext_count);
}

/*
 * Perform the negation of a u64 input LWE ciphertext vector.
 * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
 * launch
 * - `gpu_index` is the index of the GPU to be used in the kernel launch
 * - `lwe_array_out` is an array of size
 * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
 * been allocated on the GPU before calling this function, and that will hold
 * the result of the computation.
 * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
 * been allocated and initialized before calling this function. It has the same
 * size as the output array.
 * - `input_lwe_dimension` is the number of mask elements in the input and
 * output ciphertext vectors
 * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
 * input LWE ciphertext vector, as well as in the output.
 *
 * Each element (mask element or body) of the input LWE ciphertext vector is
 * negated. The result is stored in the output LWE ciphertext vector. The input
 * LWE ciphertext vector is left unchanged. This function is a wrapper to a
 * device function that performs the operation on the GPU.
 */
void cuda_negate_lwe_ciphertext_vector_64(cuda_stream_t *stream,
                                          void *lwe_array_out,
                                          void *lwe_array_in,
                                          uint32_t input_lwe_dimension,
                                          uint32_t input_lwe_ciphertext_count) {

  host_negation(stream, static_cast<uint64_t *>(lwe_array_out),
                static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
                input_lwe_ciphertext_count);
}
@@ -0,0 +1,44 @@
#ifndef CUDA_NEGATE_CUH
#define CUDA_NEGATE_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "../utils/kernel_dimensions.cuh"
#include "device.h"
#include "linear_algebra.h"

template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {

  int tid = threadIdx.x;
  int index = blockIdx.x * blockDim.x + tid;
  if (index < num_entries) {
    // Here we take advantage of the wrapping behaviour of uint
    output[index] = -input[index];
  }
}

template <typename T>
__host__ void host_negation(cuda_stream_t *stream, T *output, T *input,
                            uint32_t input_lwe_dimension,
                            uint32_t input_lwe_ciphertext_count) {

  cudaSetDevice(stream->gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count * lwe_size;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);

  negation<<<grid, thds, 0, stream->stream>>>(output, input, num_entries);
  check_cuda_error(cudaGetLastError());
}

#endif // CUDA_NEGATE_H
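Editor's note (not part of the diff): the element-wise addition, subtraction and negation kernels above rely on native unsigned wrap-around to implement torus arithmetic modulo 2^64 (or 2^32 for the u32 variants). A minimal host-side sketch of the same wrapping operations on a toy "ciphertext" of plain uint64_t values, for illustration only:

#include <cstdint>
#include <cstdio>
#include <vector>

// Every mask element and the body are combined with plain unsigned arithmetic,
// whose wrap-around is exactly reduction modulo 2^64 on the discretized torus.
int main() {
  std::vector<uint64_t> a = {0x8000000000000000ull, 42, 7}; // mask..., body
  std::vector<uint64_t> b = {0xC000000000000000ull, 1, 5};
  std::vector<uint64_t> sum(a.size()), neg(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    sum[i] = a[i] + b[i]; // wraps mod 2^64, like the addition kernel
    neg[i] = -a[i];       // wraps mod 2^64, like the negation kernel
  }
  printf("%llx %llu %llu\n", (unsigned long long)sum[0],
         (unsigned long long)sum[1], (unsigned long long)sum[2]); // 4000000000000000 43 12
  printf("%llx\n", (unsigned long long)neg[0]);                   // 8000000000000000
}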
1  backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap.cu  Normal file
@@ -0,0 +1 @@
#include "bootstrapping_key.cuh"
377  backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cu  Normal file
@@ -0,0 +1,377 @@
|
||||
#include "bootstrap_amortized.cuh"
|
||||
|
||||
/*
|
||||
* Returns the buffer size for 64 bits executions
|
||||
*/
|
||||
uint64_t get_buffer_size_bootstrap_amortized_64(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
return get_buffer_size_bootstrap_amortized<uint64_t>(
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
/*
|
||||
* Runs standard checks to validate the inputs
|
||||
*/
|
||||
void checks_fast_bootstrap_amortized(int polynomial_size) {
|
||||
assert(
|
||||
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
|
||||
"1024, 2048, 4096, 8192, 16384",
|
||||
polynomial_size == 256 || polynomial_size == 512 ||
|
||||
polynomial_size == 1024 || polynomial_size == 2048 ||
|
||||
polynomial_size == 4096 || polynomial_size == 8192 ||
|
||||
polynomial_size == 16384));
|
||||
}
|
||||
|
||||
/*
|
||||
* Runs standard checks to validate the inputs
|
||||
*/
|
||||
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
|
||||
assert(("Error (GPU amortized PBS): base log should be <= nbits",
|
||||
base_log <= nbits));
|
||||
checks_fast_bootstrap_amortized(polynomial_size);
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
|
||||
* be used.
|
||||
*/
|
||||
void scratch_cuda_bootstrap_amortized_32(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
checks_fast_bootstrap_amortized(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<256>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<512>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<1024>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<2048>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<4096>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<8192>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<16384>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the amortized PBS on 64 bits inputs, into `pbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
|
||||
* be used.
|
||||
*/
|
||||
void scratch_cuda_bootstrap_amortized_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
checks_fast_bootstrap_amortized(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform the programmable bootstrapping on a batch of input u32 LWE
|
||||
* ciphertexts. See the corresponding operation on 64 bits for more details.
|
||||
*/
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
|
||||
checks_bootstrap_amortized(32, base_log, polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
|
||||
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 512:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
|
||||
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
|
||||
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
|
||||
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
|
||||
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
|
||||
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 16384:
|
||||
host_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
|
||||
stream, (uint32_t *)lwe_array_out, (uint32_t *)lwe_output_indexes,
|
||||
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}

/* Perform the programmable bootstrapping on a batch of input u64 LWE
 * ciphertexts. This function performs best for large numbers of inputs (> 10).
 * - `stream`: the Cuda stream (which also carries the GPU index) to be used
 *   in the kernel launch
 * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
 *   (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many luts of size polynomial_size
 *   as there are input ciphertexts, but actually holds
 *   num_luts vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to
 *   which lut of lut_vector to use for each LWE input in
 *   lwe_array_in
 * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 * - bootstrapping_key: GGSW encryption of the LWE secret key sk1
 *   under secret key sk2
 *   bsk = Z + sk1 H
 *   where H is the gadget matrix and Z is a matrix (k+1).l
 *   containing GLWE encryptions of 0 under sk2.
 *   bsk is thus a tensor of size (k+1)^2.l.N.n
 *   where l is the number of decomposition levels,
 *   k is the GLWE dimension and N is the polynomial size for
 *   GLWE. The polynomial size for GLWE and the lut
 *   are the same because they have to be in the same ring
 *   to be multiplied.
 * - input_lwe_dimension: size of the Torus vector used to encrypt the input
 *   LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (lut) and size of the
 *   GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
 * - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
 * - level_count: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
 * - num_luts: parameter to set the actual number of luts to be used
 * - lwe_idx: the index of the LWE input to consider for the GPU running this
 *   call. In case of multi-GPU computing, it is assumed that only a part of
 *   the input LWE array is copied to each GPU, but the whole LUT array is
 *   copied (because the case where the number of LUTs is smaller than the
 *   number of input LWEs is not trivial to take into account in the data
 *   repartition on the GPUs). `lwe_idx` is used to determine which LUT to
 *   consider for a given LWE input in the LUT array `lut_vector`.
 * - `max_shared_memory`: maximum amount of shared memory to be used inside
 *   device functions
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
 * - the kernel is templatized based on integer discretization and
 *   polynomial degree
 * - num_samples blocks of threads are launched, where each thread is going
 *   to handle one or more polynomial coefficients at each stage:
 *   - perform the blind rotation
 *   - round the result
 *   - decompose into level_count levels, then for each level:
 *     - switch to the FFT domain
 *     - multiply with the bootstrapping key
 *     - come back to the coefficients representation
 * - between each stage a synchronization of the threads is necessary
 * - in case the device has enough shared memory, temporary arrays used for
 *   the different stages (accumulators) are stored into the shared memory
 * - the accumulators serve to combine the results for all decomposition
 *   levels
 * - the constant memory (64K) is used for storing the roots of unity
 *   values for the FFT
 */
|
||||
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
|
||||
checks_bootstrap_amortized(64, base_log, polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
|
||||
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 512:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
|
||||
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
|
||||
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
|
||||
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
|
||||
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
|
||||
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
case 16384:
|
||||
host_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
|
||||
stream, (uint64_t *)lwe_array_out, (uint64_t *)lwe_output_indexes,
|
||||
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_luts,
|
||||
lwe_idx, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}

/*
 * This cleanup function frees the data for the amortized PBS on GPU in
 * pbs_buffer for 32 or 64 bits inputs.
 */
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
                                      int8_t **pbs_buffer) {

  // Free memory
  cuda_drop_async(*pbs_buffer, stream);
}
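For orientation, here is a minimal host-side sketch of the amortized PBS lifecycle exposed by this file: allocate the scratch buffer, bootstrap the batch, then free the buffer. Only cuda_bootstrap_amortized_lwe_ciphertext_vector_64 and cleanup_cuda_bootstrap_amortized appear above; the name scratch_cuda_bootstrap_amortized_64 for the scratch entry point is an assumption (the 64-bit scratch wrapper is defined earlier in this file), and every size and pointer below is a placeholder.

// Hedged sketch only: scratch_cuda_bootstrap_amortized_64 is an assumed name.
void example_amortized_pbs_64(cuda_stream_t *stream, void *lwe_out,
                              void *lwe_out_idx, void *lut, void *lut_idx,
                              void *lwe_in, void *lwe_in_idx, void *fourier_bsk,
                              uint32_t lwe_dimension, uint32_t glwe_dimension,
                              uint32_t polynomial_size, uint32_t base_log,
                              uint32_t level_count, uint32_t num_samples,
                              uint32_t max_shared_memory) {
  int8_t *pbs_buffer = nullptr;

  // 1) Allocate the scratch space sized for this batch (assumed entry point;
  //    it forwards to scratch_bootstrap_amortized<uint64_t, int64_t, ...>).
  scratch_cuda_bootstrap_amortized_64(stream, &pbs_buffer, glwe_dimension,
                                      polynomial_size, num_samples,
                                      max_shared_memory,
                                      /*allocate_gpu_memory=*/true);

  // 2) Bootstrap the whole batch with a single LUT (num_luts = 1, lwe_idx = 0).
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      stream, lwe_out, lwe_out_idx, lut, lut_idx, lwe_in, lwe_in_idx,
      fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
      base_log, level_count, num_samples, /*num_luts=*/1, /*lwe_idx=*/0,
      max_shared_memory);

  // 3) Release the scratch space on the same stream.
  cleanup_cuda_bootstrap_amortized(stream, &pbs_buffer);
}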
backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_amortized.cuh (new file, 363 lines)
@@ -0,0 +1,363 @@
|
||||
#ifndef CUDA_AMORTIZED_PBS_CUH
|
||||
#define CUDA_AMORTIZED_PBS_CUH
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
/*
 * Kernel launched by host_bootstrap_amortized
 *
 * Uses shared memory to increase performance
 * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
 *   (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many luts of size polynomial_size
 *   as there are input ciphertexts, but actually holds
 *   num_luts vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to which lut
 *   to use for each sample in lut_vector
 * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 * - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
 *   key sk2
 * - device_mem: pointer to the device's global memory in case we use it (SMD
 *   == NOSM or PARTIALSM)
 * - lwe_dimension: size of the Torus vector used to encrypt the input
 *   LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (lut) and size of the
 *   GLWE polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - level_count: number of decomposition levels in the gadget matrix (~4)
 * - gpu_num: index of the current GPU (useful for multi-GPU computations)
 * - lwe_idx: equal to the number of samples per gpu x gpu_num
 * - device_memory_size_per_sample: amount of global memory to allocate if SMD
 *   is not FULLSM
 */
|
||||
__global__ void device_bootstrap_amortized(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t lwe_idx,
|
||||
size_t device_memory_size_per_sample) {
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory;
|
||||
|
||||
if constexpr (SMD == FULLSM)
|
||||
selected_memory = sharedmem;
|
||||
else
|
||||
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
|
||||
|
||||
// For GPU bootstrapping the GLWE dimension is hard-set to 1: there is only
|
||||
// one mask polynomial and 1 body to handle.
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator_rotated =
|
||||
(Torus *)accumulator +
|
||||
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
|
||||
double2 *res_fft =
|
||||
(double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
|
||||
(sizeof(double2) / sizeof(Torus));
|
||||
double2 *accumulator_fft = (double2 *)sharedmem;
|
||||
if constexpr (SMD != PARTIALSM)
|
||||
accumulator_fft = (double2 *)res_fft +
|
||||
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);
|
||||
|
||||
auto block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
// Put "b", the body, in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);
|
||||
|
||||
// Loop over all the mask elements of the sample to accumulate
|
||||
// (X^a_i-1) multiplication, decomposition of the resulting polynomial
|
||||
// into level_count polynomials, and performing polynomial multiplication
|
||||
// via an FFT with the RGSW encrypted secret key
|
||||
for (int iteration = 0; iteration < lwe_dimension; iteration++) {
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Put "a" in [0, 2N[ instead of Zq
|
||||
Torus a_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[iteration], a_hat,
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^â - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count, glwe_dimension + 1);
|
||||
|
||||
// Initialize the polynomial multiplication via FFT arrays
|
||||
// The polynomial multiplications happens at the block level
|
||||
// and each thread handles two or more coefficients
|
||||
int pos = threadIdx.x;
|
||||
for (int i = 0; i < (glwe_dimension + 1); i++)
|
||||
for (int j = 0; j < params::opt / 2; j++) {
|
||||
res_fft[pos].x = 0;
|
||||
res_fft[pos].y = 0;
|
||||
pos += params::degree / params::opt;
|
||||
}
|
||||
|
||||
GadgetMatrix<Torus, params> gadget(base_log, level_count,
|
||||
accumulator_rotated, glwe_dimension + 1);
|
||||
// Now that the rotation is done, decompose the resulting polynomial
|
||||
// coefficients so as to multiply each decomposed level with the
|
||||
// corresponding part of the bootstrapping key
|
||||
for (int level = level_count - 1; level >= 0; level--) {
|
||||
for (int i = 0; i < (glwe_dimension + 1); i++) {
|
||||
gadget.decompose_and_compress_next_polynomial(accumulator_fft, i);
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
|
||||
// Get the bootstrapping key piece necessary for the multiplication
|
||||
// It is already in the Fourier domain
|
||||
auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
|
||||
level, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
|
||||
// Perform the coefficient-wise product with the two pieces of
|
||||
// bootstrapping key
|
||||
for (int j = 0; j < (glwe_dimension + 1); j++) {
|
||||
auto bsk_poly = bsk_slice + j * params::degree / 2;
|
||||
auto res_fft_poly = res_fft + j * params::degree / 2;
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
res_fft_poly, accumulator_fft, bsk_poly);
|
||||
}
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
// Come back to the coefficient representation
|
||||
if constexpr (SMD == FULLSM || SMD == NOSM) {
|
||||
synchronize_threads_in_block();
|
||||
|
||||
for (int i = 0; i < (glwe_dimension + 1); i++) {
|
||||
auto res_fft_slice = res_fft + i * params::degree / 2;
|
||||
NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
for (int i = 0; i < (glwe_dimension + 1); i++) {
|
||||
auto accumulator_slice = accumulator + i * params::degree;
|
||||
auto res_fft_slice = res_fft + i * params::degree / 2;
|
||||
add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
} else {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < (glwe_dimension + 1); i++) {
|
||||
auto accumulator_slice = accumulator + i * params::degree;
|
||||
auto res_fft_slice = res_fft + i * params::degree / 2;
|
||||
int tid = threadIdx.x;
|
||||
for (int j = 0; j < params::opt / 2; j++) {
|
||||
accumulator_fft[tid] = res_fft_slice[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1)];
|
||||
|
||||
// The blind rotation for this block is over
|
||||
// Now we can perform the sample extraction: for the body it's just
|
||||
// the resulting constant coefficient of the accumulator
|
||||
// For the mask it's more complicated
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
|
||||
glwe_dimension);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
|
||||
glwe_dimension);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_full_sm_bootstrap_amortized(
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension) {
|
||||
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
|
||||
sizeof(Torus) * polynomial_size *
|
||||
(glwe_dimension + 1) + // accumulator rotated
|
||||
sizeof(double2) * polynomial_size / 2 + // accumulator fft
|
||||
sizeof(double2) * polynomial_size / 2 *
|
||||
(glwe_dimension + 1); // res fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_bootstrap_amortized(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
|
||||
uint64_t partial_dm = full_sm - partial_sm;
|
||||
uint64_t full_dm = full_sm;
|
||||
uint64_t device_mem = 0;
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count;
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count;
|
||||
}
|
||||
return device_mem + device_mem % sizeof(double2);
|
||||
}
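As a quick sanity check of the sizing helpers above, a small worked example (illustrative only, assuming Torus = uint64_t, polynomial_size = 1024 and glwe_dimension = 1):

// Illustration only: N = 1024, k = 1, Torus = uint64_t.
constexpr uint64_t N = 1024, k = 1;
constexpr uint64_t full_sm =
    sizeof(uint64_t) * N * (k + 1) +    // accumulator
    sizeof(uint64_t) * N * (k + 1) +    // accumulator rotated
    sizeof(double2) * N / 2 +           // accumulator fft
    sizeof(double2) * N / 2 * (k + 1);  // res fft
static_assert(full_sm == 57344, "full shared memory needed per block");

constexpr uint64_t partial_sm = sizeof(double2) * N / 2;
static_assert(partial_sm == 8192, "partial shared memory needed per block");

// max_shared_memory >= 57344        -> FULLSM: no per-sample global scratch
// 8192 <= max_shared_memory < 57344 -> PARTIALSM: 49152 B of global scratch per sample
// max_shared_memory < 8192          -> NOSM: 57344 B of global scratch per sample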
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_bootstrap_amortized(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
partial_sm);
|
||||
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
}
|
||||
if (allocate_gpu_memory) {
|
||||
uint64_t buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
|
||||
max_shared_memory);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_bootstrap_amortized(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
|
||||
uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
|
||||
polynomial_size, glwe_dimension);
|
||||
|
||||
uint64_t SM_PART =
|
||||
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
|
||||
|
||||
uint64_t DM_PART = SM_FULL - SM_PART;
|
||||
|
||||
uint64_t DM_FULL = SM_FULL;
|
||||
|
||||
// Create a 1-dimensional grid of threads
|
||||
// where each block handles 1 sample and each thread
|
||||
// handles opt polynomial coefficients
|
||||
// (actually opt/2 coefficients since we compress the real polynomial into a
|
||||
// complex)
|
||||
dim3 grid(input_lwe_ciphertext_count, 1, 1);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
// Launch the kernel using polynomial_size/opt threads
|
||||
// where each thread computes opt polynomial coefficients
|
||||
// Depending on the required amount of shared memory, choose
|
||||
// from one of three templates (no use, partial use or full use
|
||||
// of shared memory)
|
||||
if (max_shared_memory < SM_PART) {
|
||||
device_bootstrap_amortized<Torus, params, NOSM>
|
||||
<<<grid, thds, 0, stream->stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, lwe_idx, DM_FULL);
|
||||
} else if (max_shared_memory < SM_FULL) {
|
||||
device_bootstrap_amortized<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, SM_PART, stream->stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, lwe_idx, DM_PART);
|
||||
} else {
|
||||
// For devices with compute capability 7.x a single thread block can
|
||||
// address the full capacity of shared memory. Shared memory on the
|
||||
// device then has to be allocated dynamically.
|
||||
// For lower compute capabilities, this call
|
||||
// just does nothing and the amount of shared memory used is 48 KB
|
||||
device_bootstrap_amortized<Torus, params, FULLSM>
|
||||
<<<grid, thds, SM_FULL, stream->stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, lwe_idx, 0);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
int cuda_get_pbs_per_gpu(int polynomial_size) {

  int blocks_per_sm = 0;
  int num_threads = polynomial_size / params::opt;
  cudaDeviceProp device_properties;
  cudaGetDeviceProperties(&device_properties, 0);
  // Query the occupancy of the full-shared-memory variant of the kernel (the
  // kernel template needs its sharedMemDegree argument to be instantiated)
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, device_bootstrap_amortized<Torus, params, FULLSM>,
      num_threads, 0);

  return device_properties.multiProcessorCount * blocks_per_sm;
}

#endif // CUDA_AMORTIZED_PBS_CUH
|
||||
@@ -0,0 +1,453 @@
|
||||
#ifndef CUDA_FAST_LOWLAT_PBS_CUH
|
||||
#define CUDA_FAST_LOWLAT_PBS_CUH
|
||||
|
||||
#ifdef __CDT_PARSER__
|
||||
#undef __CUDA_RUNTIME_H__
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
// Cooperative groups are used in the low latency PBS
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
template <typename Torus, class params>
|
||||
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
|
||||
double2 *join_buffer, double2 *bootstrapping_key,
|
||||
int polynomial_size, uint32_t glwe_dimension,
|
||||
int level_count, int iteration,
|
||||
grid_group &grid) {
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Get the pieces of the bootstrapping key that will be needed for the
|
||||
// external product; blockIdx.x is the ID of the block that's executing
|
||||
// this function, so we end up getting the lines of the bootstrapping key
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
auto bsk_slice = get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
|
||||
// Selects all GLWEs in a particular decomposition level
|
||||
auto level_join_buffer =
|
||||
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
|
||||
|
||||
// Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
// each block operating on a single level for mask and body
|
||||
|
||||
// The first product is used to initialize level_join_buffer
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
|
||||
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
grid.sync();
|
||||
|
||||
// Continues multiplying fft by every polynomial in that particular bsk level
|
||||
// Each y-block accumulates in a different polynomial at each iteration
|
||||
for (int j = 1; j < (glwe_dimension + 1); j++) {
|
||||
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
|
||||
|
||||
auto bsk_poly = bsk_slice + idx * params::degree / 2;
|
||||
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
grid.sync();
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------
|
||||
// All blocks are synchronized here; after this sync, level_join_buffer has
|
||||
// the values needed from every other block
|
||||
|
||||
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
|
||||
|
||||
// copy first product into fft buffer
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// accumulate rest of the products into fft buffer
|
||||
for (int l = 1; l < gridDim.x; l++) {
|
||||
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] += cur_src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(fft, accumulator);
|
||||
|
||||
__syncthreads();
|
||||
}
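In equation form (under my reading of the indexing above, with $L$ = level_count, $k$ = glwe_dimension, $t$ = iteration and $\odot$ the coefficient-wise product in the Fourier domain), the grid as a whole performs the external product, and each block with $j = \texttt{blockIdx.y}$ accumulates:

$$\mathrm{ACC}_j \mathrel{+}= \mathrm{FFT}^{-1}\Big(\sum_{\ell=0}^{L-1}\;\sum_{i=0}^{k} \widehat{\mathrm{Dec}_\ell(\mathrm{ACC}_i)} \odot \widehat{\mathrm{BSK}}[t][\ell][i][j]\Big)$$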
|
||||
|
||||
/*
 * Kernel launched by the low latency version of the
 * bootstrapping, that uses cooperative groups
 *
 * - lwe_array_out: vector of output LWE ciphertexts, with length
 *   (glwe_dimension * polynomial_size + 1) * num_samples
 * - lut_vector: vector of look up tables with
 *   length (glwe_dimension + 1) * polynomial_size * num_samples
 * - lut_vector_indexes: mapping between lwe_array_in and lut_vector
 * - lwe_array_in: vector of LWE inputs with length (lwe_dimension + 1) *
 *   num_samples
 *
 * Each y-block computes one element of the lwe_array_out.
 */
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_bootstrap_fast_low_latency(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory;
|
||||
uint32_t glwe_dimension = gridDim.y - 1;
|
||||
|
||||
if constexpr (SMD == FULLSM) {
|
||||
selected_memory = sharedmem;
|
||||
} else {
|
||||
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
|
||||
blockIdx.z * gridDim.x * gridDim.y;
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
// We always compute the pointer with most restrictive alignment to avoid
|
||||
// alignment issues
|
||||
double2 *accumulator_fft = (double2 *)selected_memory;
|
||||
Torus *accumulator =
|
||||
(Torus *)accumulator_fft +
|
||||
(ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
|
||||
Torus *accumulator_rotated =
|
||||
(Torus *)accumulator + (ptrdiff_t)polynomial_size;
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double2 *)sharedmem;
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
// Since the space in L1 cache is small, we reuse the same memory location
// for the rotated accumulator and the fft accumulator, since we know the
// rotated array is no longer in use by the time we perform the fft
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[i], a_hat,
|
||||
2 * params::degree); // 2 * params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^â - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
|
||||
polynomial_size, glwe_dimension, level_count, i, grid);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
|
||||
uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_dm = full_sm - partial_sm;
|
||||
uint64_t full_dm = full_sm;
|
||||
uint64_t device_mem = 0;
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
}
|
||||
uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2);
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_bootstrap_fast_low_latency(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
if (allocate_gpu_memory) {
|
||||
uint64_t buffer_size = get_buffer_size_bootstrap_fast_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Host wrapper to the low latency version
|
||||
* of bootstrapping
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_bootstrap_fast_low_latency(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
|
||||
uint32_t max_shared_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
uint64_t full_dm = full_sm;
|
||||
|
||||
uint64_t partial_dm = full_dm - partial_sm;
|
||||
|
||||
int8_t *d_mem = pbs_buffer;
|
||||
double2 *buffer_fft =
|
||||
(double2 *)d_mem +
|
||||
(ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory) /
|
||||
sizeof(double2) -
|
||||
(glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count * polynomial_size / 2);
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
|
||||
void *kernel_args[14];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
kernel_args[1] = &lwe_output_indexes;
|
||||
kernel_args[2] = &lut_vector;
|
||||
kernel_args[3] = &lut_vector_indexes;
|
||||
kernel_args[4] = &lwe_array_in;
|
||||
kernel_args[5] = &lwe_input_indexes;
|
||||
kernel_args[6] = &bootstrapping_key;
|
||||
kernel_args[7] = &buffer_fft;
|
||||
kernel_args[8] = &lwe_dimension;
|
||||
kernel_args[9] = &polynomial_size;
|
||||
kernel_args[10] = &base_log;
|
||||
kernel_args[11] = &level_count;
|
||||
kernel_args[12] = &d_mem;
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
kernel_args[13] = &full_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, grid,
|
||||
thds, (void **)kernel_args, 0, stream->stream));
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
kernel_args[13] = &partial_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
|
||||
grid, thds, (void **)kernel_args, partial_sm, stream->stream));
|
||||
} else {
|
||||
int no_dm = 0;
|
||||
kernel_args[13] = &no_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, grid,
|
||||
thds, (void **)kernel_args, full_sm, stream->stream));
|
||||
}
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Verify if the grid size for the low latency kernel satisfies the cooperative
|
||||
// group constraints
|
||||
template <typename Torus, class params>
|
||||
__host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
// If Cooperative Groups is not supported, no need to check anything else
|
||||
if (!cuda_check_support_cooperative_groups())
|
||||
return false;
|
||||
|
||||
// Calculate the dimension of the kernel
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(params::degree);
|
||||
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
|
||||
params::degree);
|
||||
|
||||
int thds = params::degree / params::opt;
|
||||
|
||||
// Get the maximum number of active blocks per streaming multiprocessors
|
||||
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
|
||||
int max_active_blocks_per_sm;
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, thds,
|
||||
0);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
|
||||
thds, 0);
|
||||
} else {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, thds,
|
||||
0);
|
||||
}
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
|
||||
}
|
||||
|
||||
#endif // CUDA_FAST_LOWLAT_PBS_CUH
|
||||
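The grid-size check above is what the 64-bit dispatcher in bootstrap_low_latency.cu (further down in this diff) uses to pick between the cooperative-groups kernel and the regular low-latency fallback. A condensed sketch of that selection follows; the wrapper name pick_low_latency_buffer_size is mine, and get_buffer_size_bootstrap_low_latency<> is assumed from its use below.

template <typename Torus, class params>
uint64_t pick_low_latency_buffer_size(uint32_t glwe_dimension,
                                      uint32_t polynomial_size,
                                      uint32_t level_count,
                                      uint32_t num_samples,
                                      uint32_t max_shared_memory) {
  // Use the cooperative-groups ("fast") kernel only if all of its blocks can
  // be resident on the device at once; otherwise fall back.
  if (verify_cuda_bootstrap_fast_low_latency_grid_size<Torus, params>(
          glwe_dimension, level_count, num_samples, max_shared_memory))
    return get_buffer_size_bootstrap_fast_low_latency<Torus>(
        glwe_dimension, polynomial_size, level_count, num_samples,
        max_shared_memory);
  return get_buffer_size_bootstrap_low_latency<Torus>(
      glwe_dimension, polynomial_size, level_count, num_samples,
      max_shared_memory);
}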
@@ -0,0 +1,322 @@
|
||||
#ifndef CUDA_FAST_MULTIBIT_PBS_CUH
|
||||
#define CUDA_FAST_MULTIBIT_PBS_CUH
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "bootstrap_multibit.cuh"
|
||||
#include "bootstrap_multibit.h"
|
||||
#include "cooperative_groups.h"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/ggsw.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void device_multi_bit_bootstrap_fast_accumulate(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
double2 *keybundle_array, double2 *join_buffer, Torus *global_accumulator,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t lwe_offset, uint32_t lwe_chunk_size,
|
||||
uint32_t keybundle_size_per_input) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory;
|
||||
|
||||
selected_memory = sharedmem;
|
||||
|
||||
// We always compute the pointer with most restrictive alignment to avoid
|
||||
// alignment issues
|
||||
double2 *accumulator_fft = (double2 *)selected_memory;
|
||||
Torus *accumulator =
|
||||
(Torus *)accumulator_fft +
|
||||
(ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.z * keybundle_size_per_input;
|
||||
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
global_slice, accumulator);
|
||||
}
|
||||
|
||||
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
|
||||
block_join_buffer, keybundle, polynomial_size,
|
||||
glwe_dimension, level_count, i, grid);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_slice);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
get_buffer_size_full_sm_fast_multibit_bootstrap(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size * 2; // accumulator
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t get_buffer_size_fast_multibit_bootstrap(
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t grouping_factor, uint32_t lwe_chunk_size,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
uint64_t buffer_size = 0;
|
||||
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2) * sizeof(double2); // keybundle fft
|
||||
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
|
||||
level_count * (polynomial_size / 2) *
|
||||
sizeof(double2); // join buffer
|
||||
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
|
||||
polynomial_size * sizeof(Torus); // global_accumulator
|
||||
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_fast_multi_bit_pbs(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t full_sm_accumulate =
|
||||
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);
|
||||
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_bootstrap_keybundle<Torus, params>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
|
||||
cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size =
|
||||
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
|
||||
uint64_t buffer_size = get_buffer_size_fast_multibit_bootstrap<Torus>(
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, lwe_chunk_size,
|
||||
max_shared_memory);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
__host__ void host_fast_multi_bit_pbs(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
  cudaSetDevice(stream->gpu_index);

  if (!lwe_chunk_size)
    lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count,
                                                glwe_dimension, num_samples);

  // Carve the scratch buffer into the keybundle FFT, the per-level FFT buffer
  // and the global accumulator
  double2 *keybundle_fft = (double2 *)pbs_buffer;
  double2 *buffer_fft = (double2 *)keybundle_fft +
                        num_samples * lwe_chunk_size * level_count *
                            (glwe_dimension + 1) * (glwe_dimension + 1) *
                            (polynomial_size / 2);
  Torus *global_accumulator =
      (Torus *)buffer_fft +
      (ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
                  level_count * (polynomial_size / 2) / sizeof(Torus));

  // Shared memory requirements of the two kernels
  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
          polynomial_size);
  uint64_t full_sm_accumulate =
      get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);

  uint32_t keybundle_size_per_input =
      lwe_chunk_size * level_count * (glwe_dimension + 1) *
      (glwe_dimension + 1) * (polynomial_size / 2);

  // Argument list for the cooperative accumulate kernel (indexes 15 and 16
  // are filled in at each chunk iteration below)
  void *kernel_args[18];
  kernel_args[0] = &lwe_array_out;
  kernel_args[1] = &lwe_output_indexes;
  kernel_args[2] = &lut_vector;
  kernel_args[3] = &lut_vector_indexes;
  kernel_args[4] = &lwe_array_in;
  kernel_args[5] = &lwe_input_indexes;
  kernel_args[6] = &keybundle_fft;
  kernel_args[7] = &buffer_fft;
  kernel_args[8] = &global_accumulator;
  kernel_args[9] = &lwe_dimension;
  kernel_args[10] = &glwe_dimension;
  kernel_args[11] = &polynomial_size;
  kernel_args[12] = &base_log;
  kernel_args[13] = &level_count;
  kernel_args[14] = &grouping_factor;
  kernel_args[17] = &keybundle_size_per_input;

  // Launch configuration of the accumulate kernel
  dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {

    uint32_t chunk_size = std::min(
        lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);

    // Compute a keybundle
    dim3 grid_keybundle(num_samples * chunk_size,
                        (glwe_dimension + 1) * (glwe_dimension + 1),
                        level_count);
    device_multi_bit_bootstrap_keybundle<Torus, params>
        <<<grid_keybundle, thds, full_sm_keybundle, stream->stream>>>(
            lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
            lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
            base_log, level_count, lwe_offset, chunk_size,
            keybundle_size_per_input);
    check_cuda_error(cudaGetLastError());

    kernel_args[15] = &lwe_offset;
    kernel_args[16] = &chunk_size;

    check_cuda_error(cudaLaunchCooperativeKernel(
        (void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
        grid_accumulate, thds, (void **)kernel_args, full_sm_accumulate,
        stream->stream));
  }
}

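// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the loop above walks
// the (lwe_dimension / grouping_factor) key groups in slices of
// lwe_chunk_size, so the number of keybundle + accumulate launch pairs is the
// ceiling of that ratio. The helper below only restates that arithmetic; the
// example values are hypothetical, not recommended parameters.
#include <cstdint>

constexpr uint32_t number_of_chunk_launches(uint32_t lwe_dimension,
                                            uint32_t grouping_factor,
                                            uint32_t lwe_chunk_size) {
  uint32_t groups = lwe_dimension / grouping_factor;
  return (groups + lwe_chunk_size - 1) / lwe_chunk_size; // ceiling division
}
// e.g. lwe_dimension = 744, grouping_factor = 2, lwe_chunk_size = 64
// -> 372 key groups -> 6 launches (5 full chunks + 1 partial chunk of 52).
static_assert(number_of_chunk_launches(744, 2, 64) == 6, "worked example");
// ---------------------------------------------------------------------------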
// Verify if the grid size for the fast multi-bit kernel satisfies the
// cooperative group constraints
template <typename Torus, class params>
__host__ bool
verify_cuda_bootstrap_fast_multi_bit_grid_size(int glwe_dimension,
                                               int level_count, int num_samples,
                                               uint32_t max_shared_memory) {

  // If Cooperative Groups is not supported, no need to check anything else
  if (!cuda_check_support_cooperative_groups())
    return false;

  // Calculate the dimension of the kernel
  uint64_t full_sm =
      get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(params::degree);

  int thds = params::degree / params::opt;

  // Get the maximum number of active blocks per streaming multiprocessor
  int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
  int max_active_blocks_per_sm;

  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_active_blocks_per_sm,
      (void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>, thds,
      full_sm);

  // Get the number of streaming multiprocessors
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
  return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
#endif // FASTMULTIBIT_PBS_H
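// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the same occupancy
// check can be reproduced for any cooperative kernel. `my_kernel`, the block
// size and the dynamic shared memory amount below are placeholders; the CUDA
// API calls are the ones used by verify_cuda_bootstrap_fast_multi_bit_grid_size
// above.
#include <cuda_runtime.h>

__global__ void my_kernel() {}

bool cooperative_grid_fits(int requested_blocks, int threads_per_block,
                           size_t dynamic_smem_bytes) {
  int max_active_blocks_per_sm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &max_active_blocks_per_sm, (void *)my_kernel, threads_per_block,
      dynamic_smem_bytes);

  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);

  // A cooperative launch requires every block of the grid to be resident on
  // the device at the same time.
  return requested_blocks <= max_active_blocks_per_sm * number_of_sm;
}
// ---------------------------------------------------------------------------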
845
backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_low_latency.cu
Normal file
@@ -0,0 +1,845 @@
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_low_latency.cuh"

/*
 * Returns the buffer size for 64-bit executions
 */
uint64_t get_buffer_size_bootstrap_low_latency_64(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  // For every supported polynomial size, use the fast (cooperative) variant
  // if its grid fits on the device, otherwise fall back to the regular one.
#define LOWLAT_BUFFER_SIZE_CASE(N)                                             \
  case N:                                                                      \
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,             \
                                                         AmortizedDegree<N>>(  \
            glwe_dimension, level_count, input_lwe_ciphertext_count,           \
            max_shared_memory))                                                \
      return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(             \
          glwe_dimension, polynomial_size, level_count,                        \
          input_lwe_ciphertext_count, max_shared_memory);                      \
    else                                                                       \
      return get_buffer_size_bootstrap_low_latency<uint64_t>(                  \
          glwe_dimension, polynomial_size, level_count,                        \
          input_lwe_ciphertext_count, max_shared_memory);

  switch (polynomial_size) {
    LOWLAT_BUFFER_SIZE_CASE(256)
    LOWLAT_BUFFER_SIZE_CASE(512)
    LOWLAT_BUFFER_SIZE_CASE(1024)
    LOWLAT_BUFFER_SIZE_CASE(2048)
    LOWLAT_BUFFER_SIZE_CASE(4096)
    LOWLAT_BUFFER_SIZE_CASE(8192)
    LOWLAT_BUFFER_SIZE_CASE(16384)
  default:
    return 0;
  }
#undef LOWLAT_BUFFER_SIZE_CASE
}

/*
 * Runs standard checks to validate the inputs
 */
void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count,
                                       int polynomial_size, int num_samples) {

  assert(("Error (GPU low latency PBS): polynomial size should be one of 256, "
          "512, 1024, 2048, 4096, 8192, 16384",
          polynomial_size == 256 || polynomial_size == 512 ||
              polynomial_size == 1024 || polynomial_size == 2048 ||
              polynomial_size == 4096 || polynomial_size == 8192 ||
              polynomial_size == 16384));
}

/*
 * Runs standard checks to validate the inputs
 */
void checks_bootstrap_low_latency(int nbits, int glwe_dimension,
                                  int level_count, int base_log,
                                  int polynomial_size, int num_samples) {
  assert(("Error (GPU low latency PBS): base log should be <= nbits",
          base_log <= nbits));
  checks_fast_bootstrap_low_latency(glwe_dimension, level_count,
                                    polynomial_size, num_samples);
}

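// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the asserts above use
// the comma operator so that the message string appears in the diagnostic
// while only the right-hand operand is actually evaluated as the condition.
#include <cassert>

inline void demo_message_assert(int nbits, int base_log) {
  // The string literal is discarded at runtime but is printed as part of the
  // failing expression if the assert fires.
  assert(("base log should be <= nbits", base_log <= nbits));
}
// ---------------------------------------------------------------------------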
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the low latency PBS on 32-bit inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM or PARTIALSM mode is going
 * to be used.
 */
void scratch_cuda_bootstrap_low_latency_32(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {
  checks_fast_bootstrap_low_latency(glwe_dimension, level_count,
                                    polynomial_size,
                                    input_lwe_ciphertext_count);

#define LOWLAT_SCRATCH_32_CASE(N)                                              \
  case N:                                                                      \
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,             \
                                                         AmortizedDegree<N>>(  \
            glwe_dimension, level_count, input_lwe_ciphertext_count,           \
            max_shared_memory))                                                \
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,                    \
                                         AmortizedDegree<N>>(                  \
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,    \
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); \
    else                                                                       \
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<N>>(             \
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,    \
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); \
    break;

  switch (polynomial_size) {
    LOWLAT_SCRATCH_32_CASE(256)
    LOWLAT_SCRATCH_32_CASE(512)
    LOWLAT_SCRATCH_32_CASE(1024)
    LOWLAT_SCRATCH_32_CASE(2048)
    LOWLAT_SCRATCH_32_CASE(4096)
    LOWLAT_SCRATCH_32_CASE(8192)
    LOWLAT_SCRATCH_32_CASE(16384)
  default:
    break;
  }
#undef LOWLAT_SCRATCH_32_CASE
}

/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the low latency PBS on 64-bit inputs, into `pbs_buffer`. It also
 * configures SM options on the GPU in case FULLSM or PARTIALSM mode is going
 * to be used.
 */
void scratch_cuda_bootstrap_low_latency_64(
    cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory) {

  checks_fast_bootstrap_low_latency(glwe_dimension, level_count,
                                    polynomial_size,
                                    input_lwe_ciphertext_count);

#define LOWLAT_SCRATCH_64_CASE(N)                                              \
  case N:                                                                      \
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,             \
                                                         AmortizedDegree<N>>(  \
            glwe_dimension, level_count, input_lwe_ciphertext_count,           \
            max_shared_memory))                                                \
      scratch_bootstrap_fast_low_latency<uint64_t, int64_t,                    \
                                         AmortizedDegree<N>>(                  \
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,    \
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); \
    else                                                                       \
      scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<N>>(             \
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,    \
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); \
    break;

  switch (polynomial_size) {
    LOWLAT_SCRATCH_64_CASE(256)
    LOWLAT_SCRATCH_64_CASE(512)
    LOWLAT_SCRATCH_64_CASE(1024)
    LOWLAT_SCRATCH_64_CASE(2048)
    LOWLAT_SCRATCH_64_CASE(4096)
    LOWLAT_SCRATCH_64_CASE(8192)
    LOWLAT_SCRATCH_64_CASE(16384)
  default:
    break;
  }
#undef LOWLAT_SCRATCH_64_CASE
}

/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
 * This function performs best for small numbers of inputs. Beyond a certain
 * number of inputs (the exact number depends on the cryptographic parameters),
 * the kernel cannot be launched and it is necessary to split the kernel call
 * into several calls on smaller batches of inputs. For more details on this
 * operation, see the equivalent u64 operation below.
 */
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {

  checks_bootstrap_low_latency(32, glwe_dimension, level_count, base_log,
                               polynomial_size, num_samples);

#define LOWLAT_PBS_32_CASE(N)                                                  \
  case N:                                                                      \
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,             \
                                                         AmortizedDegree<N>>(  \
            glwe_dimension, level_count, num_samples, max_shared_memory))      \
      host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<N>>(           \
          stream, static_cast<uint32_t *>(lwe_array_out),                      \
          static_cast<uint32_t *>(lwe_output_indexes),                         \
          static_cast<uint32_t *>(lut_vector),                                 \
          static_cast<uint32_t *>(lut_vector_indexes),                         \
          static_cast<uint32_t *>(lwe_array_in),                               \
          static_cast<uint32_t *>(lwe_input_indexes),                          \
          static_cast<double2 *>(bootstrapping_key), pbs_buffer,               \
          glwe_dimension, lwe_dimension, polynomial_size, base_log,            \
          level_count, num_samples, num_luts, max_shared_memory);              \
    else                                                                       \
      host_bootstrap_low_latency<uint32_t, Degree<N>>(                         \
          stream, static_cast<uint32_t *>(lwe_array_out),                      \
          static_cast<uint32_t *>(lwe_output_indexes),                         \
          static_cast<uint32_t *>(lut_vector),                                 \
          static_cast<uint32_t *>(lut_vector_indexes),                         \
          static_cast<uint32_t *>(lwe_array_in),                               \
          static_cast<uint32_t *>(lwe_input_indexes),                          \
          static_cast<double2 *>(bootstrapping_key), pbs_buffer,               \
          glwe_dimension, lwe_dimension, polynomial_size, base_log,            \
          level_count, num_samples, num_luts, max_shared_memory);              \
    break;

  switch (polynomial_size) {
    LOWLAT_PBS_32_CASE(256)
    LOWLAT_PBS_32_CASE(512)
    LOWLAT_PBS_32_CASE(1024)
    LOWLAT_PBS_32_CASE(2048)
    LOWLAT_PBS_32_CASE(4096)
    LOWLAT_PBS_32_CASE(8192)
    LOWLAT_PBS_32_CASE(16384)
  default:
    break;
  }
#undef LOWLAT_PBS_32_CASE
}

/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
 * This function performs best for small numbers of inputs. Beyond a certain
 * number of inputs (the exact number depends on the cryptographic parameters),
 * the kernel cannot be launched and it is necessary to split the kernel call
 * into several calls on smaller batches of inputs.
 *
 * - `stream` is the Cuda stream (and associated GPU index) to be used in the
 *   kernel launch
 * - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
 *   (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many luts of size polynomial_size as there are
 *   input ciphertexts, but actually holds num_luts vectors to reduce memory
 *   usage
 * - lut_vector_indexes: stores the index corresponding to which lut to use
 *   for each sample in lut_vector
 * - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 * - bootstrapping_key: GGSW encryption of the LWE secret key sk1 under secret
 *   key sk2
 *     bsk = Z + sk1 H
 *   where H is the gadget matrix and Z is a matrix of size (k+1).l containing
 *   GLWE encryptions of 0 under sk2. bsk is thus a tensor of size
 *   (k+1)^2.l.N.n, where l is the number of decomposition levels, k is the
 *   GLWE dimension and N is the polynomial size for GLWE. The polynomial size
 *   for GLWE and the lut are the same because they have to be in the same
 *   ring to be multiplied.
 * - lwe_dimension: size of the Torus vector used to encrypt the input LWE
 *   ciphertexts - referred to as n above (~ 600)
 * - glwe_dimension: size of the polynomial vector used to encrypt the LUT
 *   GLWE ciphertexts - referred to as k above. Only the value 1 is supported
 *   for this parameter.
 * - polynomial_size: size of the test polynomial (lut) and size of the GLWE
 *   polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - level_count: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
 * - num_luts: parameter to set the actual number of luts to be used
 * - lwe_idx: the index of the LWE input to consider for the GPU of index
 *   gpu_index. In case of multi-GPU computing, it is assumed that only a part
 *   of the input LWE array is copied to each GPU, but the whole LUT array is
 *   copied (because the case when the number of LUTs is smaller than the
 *   number of input LWEs is not trivial to take into account in the data
 *   repartition on the GPUs). `lwe_idx` is used to determine which LUT to
 *   consider for a given LWE input in the LUT array `lut_vector`.
 * - `max_shared_memory`: maximum amount of shared memory to be used inside
 *   device functions
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
 * - the kernel is templatized based on integer discretization and polynomial
 *   degree
 * - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
 *   launched, where each thread is going to handle one or more polynomial
 *   coefficients at each stage, for a given level of decomposition, either
 *   for the LUT mask or its body:
 *     - perform the blind rotation
 *     - round the result
 *     - get the decomposition for the current level
 *     - switch to the FFT domain
 *     - multiply with the bootstrapping key
 *     - come back to the coefficients representation
 * - between each stage a synchronization of the threads is necessary (some
 *   synchronizations happen at the block level, some happen between blocks,
 *   using cooperative groups)
 * - in case the device has enough shared memory, temporary arrays used for
 *   the different stages (accumulators) are stored into the shared memory
 * - the accumulators serve to combine the results for all decomposition
 *   levels
 * - the constant memory (64K) is used for storing the roots of identity
 *   values for the FFT
 */
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
    cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
    void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
    void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
  checks_bootstrap_low_latency(64, glwe_dimension, level_count, base_log,
                               polynomial_size, num_samples);

#define LOWLAT_PBS_64_CASE(N)                                                  \
  case N:                                                                      \
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,             \
                                                         AmortizedDegree<N>>(  \
            glwe_dimension, level_count, num_samples, max_shared_memory))      \
      host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<N>>(           \
          stream, static_cast<uint64_t *>(lwe_array_out),                      \
          static_cast<uint64_t *>(lwe_output_indexes),                         \
          static_cast<uint64_t *>(lut_vector),                                 \
          static_cast<uint64_t *>(lut_vector_indexes),                         \
          static_cast<uint64_t *>(lwe_array_in),                               \
          static_cast<uint64_t *>(lwe_input_indexes),                          \
          static_cast<double2 *>(bootstrapping_key), pbs_buffer,               \
          glwe_dimension, lwe_dimension, polynomial_size, base_log,            \
          level_count, num_samples, num_luts, max_shared_memory);              \
    else                                                                       \
      host_bootstrap_low_latency<uint64_t, Degree<N>>(                         \
          stream, static_cast<uint64_t *>(lwe_array_out),                      \
          static_cast<uint64_t *>(lwe_output_indexes),                         \
          static_cast<uint64_t *>(lut_vector),                                 \
          static_cast<uint64_t *>(lut_vector_indexes),                         \
          static_cast<uint64_t *>(lwe_array_in),                               \
          static_cast<uint64_t *>(lwe_input_indexes),                          \
          static_cast<double2 *>(bootstrapping_key), pbs_buffer,               \
          glwe_dimension, lwe_dimension, polynomial_size, base_log,            \
          level_count, num_samples, num_luts, max_shared_memory);              \
    break;

  // The grid-size check uses the u64 torus type for every degree.
  switch (polynomial_size) {
    LOWLAT_PBS_64_CASE(256)
    LOWLAT_PBS_64_CASE(512)
    LOWLAT_PBS_64_CASE(1024)
    LOWLAT_PBS_64_CASE(2048)
    LOWLAT_PBS_64_CASE(4096)
    LOWLAT_PBS_64_CASE(8192)
    LOWLAT_PBS_64_CASE(16384)
  default:
    break;
  }
#undef LOWLAT_PBS_64_CASE
}

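// ---------------------------------------------------------------------------
// Illustrative sizing sketch (not part of the original file), using the
// tensor shape given in the documentation comment above and the rough
// magnitudes it quotes (these are NOT recommended parameters): with k = 1,
// l = 4, N = 1024 and n = 600, bsk holds (k+1)^2 * l * N * n torus elements.
constexpr unsigned long long bsk_element_count(unsigned long long k,
                                               unsigned long long l,
                                               unsigned long long N,
                                               unsigned long long n) {
  return (k + 1) * (k + 1) * l * N * n;
}
// 4 * 4 * 1024 * 600 = 9,830,400 elements, i.e. 75 MiB of u64 data before the
// Fourier-domain representation.
static_assert(bsk_element_count(1, 4, 1024, 600) == 9830400ULL,
              "worked example from the documentation comment above");
// ---------------------------------------------------------------------------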
/*
 * This cleanup function frees the data for the low latency PBS on GPU in
 * pbs_buffer for 32- or 64-bit inputs.
 */
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
                                        int8_t **pbs_buffer) {
  // Free memory
  cuda_drop_async(*pbs_buffer, stream);
}
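// ---------------------------------------------------------------------------
// Illustrative call sequence (not part of the original file). The device
// pointers and parameter values are placeholders the caller is assumed to
// have prepared (keys converted to the Fourier domain, arrays copied to the
// GPU, etc.); the point is only the order scratch -> bootstrap -> cleanup.
void example_low_latency_pbs_64(cuda_stream_t *stream, void *d_lwe_out,
                                void *d_out_idx, void *d_lut, void *d_lut_idx,
                                void *d_lwe_in, void *d_in_idx, void *d_bsk,
                                uint32_t lwe_dimension, uint32_t glwe_dimension,
                                uint32_t polynomial_size, uint32_t base_log,
                                uint32_t level_count, uint32_t num_samples,
                                uint32_t max_shared_memory) {
  int8_t *pbs_buffer = nullptr;

  // 1) Allocate the scratch buffer and configure shared-memory carveouts.
  scratch_cuda_bootstrap_low_latency_64(
      stream, &pbs_buffer, glwe_dimension, polynomial_size, level_count,
      num_samples, max_shared_memory, /*allocate_gpu_memory=*/true);

  // 2) Run the programmable bootstrap on the whole batch.
  cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
      stream, d_lwe_out, d_out_idx, d_lut, d_lut_idx, d_lwe_in, d_in_idx,
      d_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
      base_log, level_count, num_samples, /*num_luts=*/1, /*lwe_idx=*/0,
      max_shared_memory);

  // 3) Release the scratch buffer on the same stream.
  cleanup_cuda_bootstrap_low_latency(stream, &pbs_buffer);
}
// ---------------------------------------------------------------------------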
@@ -0,0 +1,487 @@
#ifndef CUDA_LOWLAT_PBS_CUH
#define CUDA_LOWLAT_PBS_CUH

#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif

#include "bootstrap.h"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_one(
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
    uint64_t device_memory_size_per_block) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory;
  uint32_t glwe_dimension = gridDim.y - 1;

  if constexpr (SMD == FULLSM) {
    selected_memory = sharedmem;
  } else {
    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
                      blockIdx.z * gridDim.x * gridDim.y;
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

  Torus *accumulator = (Torus *)selected_memory;
  double2 *accumulator_fft =
      (double2 *)accumulator +
      (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));

  if constexpr (SMD == PARTIALSM)
    accumulator_fft = (double2 *)sharedmem;

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  Torus *block_lwe_array_in =
      &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];

  Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
                                        params::degree * (glwe_dimension + 1)];

  Torus *global_slice =
      global_accumulator +
      (blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;

  double2 *global_fft_slice =
      global_accumulator_fft +
      (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
       blockIdx.z * level_count * (glwe_dimension + 1)) *
          (polynomial_size / 2);

  if (lwe_iteration == 0) {
    // First iteration
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
    rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
                          2 * params::degree);
    // The y-dimension is used to select the element of the GLWE this block
    // will compute
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
        false);

    // Persist
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      global_slice[tid] = accumulator[tid];
      tid += params::degree / params::opt;
    }
  }

  // Put "a" in [0, 2N[
  Torus a_hat = 0;
  rescale_torus_element(block_lwe_array_in[lwe_iteration], a_hat,
                        2 * params::degree); // 2 * params::log2_degree + 1);

  synchronize_threads_in_block();

  // Perform ACC * (X^a - 1)
  multiply_by_monomial_negacyclic_and_sub_polynomial<
      Torus, params::opt, params::degree / params::opt>(global_slice,
                                                        accumulator, a_hat);

  // Perform a rounding to increase the accuracy of the
  // bootstrapped ciphertext
  round_to_closest_multiple_inplace<Torus, params::opt,
                                    params::degree / params::opt>(
      accumulator, base_log, level_count);

  synchronize_threads_in_block();

  // Decompose the accumulator. Each block gets one level of the
  // decomposition, for the mask and the body (so block 0 will have the
  // accumulator decomposed at level 0, 1 at 1, etc.)
  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
  gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

  // We are using the same memory space for accumulator_fft and
  // accumulator_rotated, so we need to synchronize here to make sure they
  // don't modify the same memory space at the same time
  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(accumulator_fft);

  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    global_fft_slice[tid] = accumulator_fft[tid];
    tid += params::degree / params::opt;
  }
}

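// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original file): rescale_torus_element
// maps a torus element stored on 64 bits onto the cyclic group of size 2N so
// it can be used as a (nega)cyclic rotation amount. A minimal scalar version
// of that mapping, assuming round-to-nearest, could look like this:
#include <cstdint>

inline uint64_t rescale_to_2N(uint64_t torus_element, uint32_t two_N) {
  // Compute torus_element * 2N / 2^64 with rounding to the nearest integer.
  __uint128_t prod = (__uint128_t)torus_element * two_N;
  uint64_t floor_part = (uint64_t)(prod >> 64);
  uint64_t rounding_bit = (uint64_t)(prod >> 63) & 1;
  return (floor_part + rounding_bit) % two_N;
}
// ---------------------------------------------------------------------------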
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_two(
    Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lut_vector,
    Torus *lut_vector_indexes, double2 *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, int8_t *device_mem,
    uint64_t device_memory_size_per_block) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ int8_t sharedmem[];
  int8_t *selected_memory;
  uint32_t glwe_dimension = gridDim.y - 1;

  if constexpr (SMD == FULLSM) {
    selected_memory = sharedmem;
  } else {
    int block_index = blockIdx.x + blockIdx.y * gridDim.x +
                      blockIdx.z * gridDim.x * gridDim.y;
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

  // We always compute the pointer with most restrictive alignment to avoid
  // alignment issues
  double2 *accumulator_fft = (double2 *)selected_memory;
  Torus *accumulator =
      (Torus *)accumulator_fft +
      (ptrdiff_t)(sizeof(double2) * params::degree / 2 / sizeof(Torus));

  if constexpr (SMD == PARTIALSM)
    accumulator_fft = (double2 *)sharedmem;

  for (int level = 0; level < level_count; level++) {
    double2 *global_fft_slice = global_accumulator_fft +
                                (level + blockIdx.x * level_count) *
                                    (glwe_dimension + 1) * (params::degree / 2);

    for (int j = 0; j < (glwe_dimension + 1); j++) {
      double2 *fft = global_fft_slice + j * params::degree / 2;

      // Get the bootstrapping key piece necessary for the multiplication
      // It is already in the Fourier domain
      auto bsk_slice =
          get_ith_mask_kth_block(bootstrapping_key, lwe_iteration, j, level,
                                 polynomial_size, glwe_dimension, level_count);
      auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;

      polynomial_product_accumulate_in_fourier_domain<params, double2>(
          accumulator_fft, fft, bsk_poly, !level && !j);
    }
  }

  Torus *global_slice =
      global_accumulator +
      (blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;

  // Load the persisted accumulator
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = global_slice[tid];
    tid += params::degree / params::opt;
  }

  // Perform the inverse FFT on the result of the GGSW x GLWE and add to the
  // accumulator
  NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
  add_to_torus<Torus, params>(accumulator_fft, accumulator);

  if (lwe_iteration + 1 == lwe_dimension) {
    // Last iteration
    auto block_lwe_array_out =
        &lwe_array_out[lwe_output_indexes[blockIdx.x] *
                           (glwe_dimension * polynomial_size + 1) +
                       blockIdx.y * polynomial_size];

    if (blockIdx.y < glwe_dimension) {
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra
      // blocks, in case they're not synchronized
      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
    } else if (blockIdx.y == glwe_dimension) {
      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
    }
  } else {
    // Persist the updated accumulator
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      global_slice[tid] = accumulator[tid];
      tid += params::degree / params::opt;
    }
  }
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
         sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
    uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size +      // accumulator
         sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
  return sizeof(double2) * polynomial_size / 2; // accumulator fft
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
          polynomial_size);
  uint64_t full_sm_step_two =
      get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
          polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);

  uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
  uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
  uint64_t full_dm = full_sm_step_one;

  uint64_t device_mem = 0;
  if (max_shared_memory < partial_sm) {
    device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                 (glwe_dimension + 1);
  } else if (max_shared_memory < full_sm_step_two) {
    device_mem = (partial_dm_step_two + partial_dm_step_one * level_count) *
                 input_lwe_ciphertext_count * (glwe_dimension + 1);
  } else if (max_shared_memory < full_sm_step_one) {
    device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
                 level_count * (glwe_dimension + 1);
  }
  // Otherwise, both kernels run all in shared memory
  uint64_t buffer_size = device_mem +
                         // global_accumulator_fft
                         (glwe_dimension + 1) * level_count *
                             input_lwe_ciphertext_count *
                             (polynomial_size / 2) * sizeof(double2) +
                         // global_accumulator
                         (glwe_dimension + 1) * input_lwe_ciphertext_count *
                             polynomial_size * sizeof(Torus);
  return buffer_size + buffer_size % sizeof(double2);
}

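// ---------------------------------------------------------------------------
// Illustrative worked example (not part of the original file), assuming the
// device has enough shared memory for both kernels (device_mem == 0) and
// hypothetical parameters glwe_dimension k = 1, level_count l = 2,
// polynomial_size N = 1024, one input ciphertext, Torus = uint64_t:
//
//   global_accumulator_fft : (k+1) * l * 1 * (N/2) * sizeof(double2)
//                          = 2 * 2 * 512 * 16  = 32768 bytes
//   global_accumulator     : (k+1) * 1 * N * sizeof(uint64_t)
//                          = 2 * 1024 * 8      = 16384 bytes
//   total                  = 49152 bytes (already a multiple of
//                            sizeof(double2), so no padding is added)
// ---------------------------------------------------------------------------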
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_bootstrap_low_latency(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
uint64_t full_sm_step_one =
|
||||
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t full_sm_step_two =
|
||||
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
|
||||
|
||||
// Configure step one
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Configure step two
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
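
// The cudaFuncSetAttribute calls above follow the standard CUDA opt-in pattern
// for using more than the default dynamic shared-memory limit per block. As a
// generic, self-contained sketch (the kernel name here is a placeholder, not a
// tfhe-rs symbol):
//
//   __global__ void example_kernel() { extern __shared__ int8_t smem[]; }
//
//   void enable_large_dynamic_smem(size_t bytes) {
//     cudaFuncSetAttribute(example_kernel,
//                          cudaFuncAttributeMaxDynamicSharedMemorySize, bytes);
//     cudaFuncSetCacheConfig(example_kernel, cudaFuncCachePreferShared);
//   }
//
// Without the attribute call, a launch requesting that much dynamic shared
// memory would fail, which is why the scratch function configures both the
// PARTIALSM and FULLSM kernel variants up front.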

template <typename Torus, class params>
__host__ void execute_low_latency_step_one(
    cuda_stream_t *stream, Torus *lut_vector, Torus *lut_vector_indexes,
    Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
    int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
    uint64_t full_sm, uint64_t full_dm) {

  int thds = polynomial_size / params::opt;
  dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

  if (max_shared_memory < partial_sm) {
    device_bootstrap_low_latency_step_one<Torus, params, NOSM>
        <<<grid, thds, 0, stream->stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm);
  } else if (max_shared_memory < full_sm) {
    device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream->stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm);
  } else {
    device_bootstrap_low_latency_step_one<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream->stream>>>(
            lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, 0);
  }
  check_cuda_error(cudaGetLastError());
}

template <typename Torus, class params>
__host__ void execute_low_latency_step_two(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, double2 *bootstrapping_key,
    Torus *global_accumulator, double2 *global_accumulator_fft,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
    int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
    uint64_t full_sm, uint64_t full_dm) {

  int thds = polynomial_size / params::opt;
  dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);

  if (max_shared_memory < partial_sm) {
    device_bootstrap_low_latency_step_two<Torus, params, NOSM>
        <<<grid, thds, 0, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, full_dm);
  } else if (max_shared_memory < full_sm) {
    device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>
        <<<grid, thds, partial_sm, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, partial_dm);
  } else {
    device_bootstrap_low_latency_step_two<Torus, params, FULLSM>
        <<<grid, thds, full_sm, stream->stream>>>(
            lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
            bootstrapping_key, global_accumulator, global_accumulator_fft,
            lwe_iteration, lwe_dimension, polynomial_size, base_log,
            level_count, d_mem, 0);
  }
  check_cuda_error(cudaGetLastError());
}

/*
 * Host wrapper to the low latency version
 * of bootstrapping
 */
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t num_luts,
    uint32_t max_shared_memory) {
  cudaSetDevice(stream->gpu_index);

  // With SM each block corresponds to either the mask or body, no need to
  // duplicate data for each
  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
          polynomial_size);
  uint64_t full_sm_step_two =
      get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
          polynomial_size);

  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);

  uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
  uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
  uint64_t full_dm_step_one = full_sm_step_one;
  uint64_t full_dm_step_two = full_sm_step_two;

  double2 *global_accumulator_fft = (double2 *)pbs_buffer;
  Torus *global_accumulator =
      (Torus *)global_accumulator_fft +
      (ptrdiff_t)(sizeof(double2) * (glwe_dimension + 1) * level_count *
                  input_lwe_ciphertext_count * (polynomial_size / 2) /
                  sizeof(Torus));
  int8_t *d_mem = (int8_t *)global_accumulator +
                  (ptrdiff_t)(sizeof(Torus) * (glwe_dimension + 1) *
                              input_lwe_ciphertext_count * polynomial_size /
                              sizeof(int8_t));

  for (int i = 0; i < lwe_dimension; i++) {
    execute_low_latency_step_one<Torus, params>(
        stream, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
        bootstrapping_key, global_accumulator, global_accumulator_fft,
        input_lwe_ciphertext_count, lwe_dimension, glwe_dimension,
        polynomial_size, base_log, level_count, d_mem, max_shared_memory, i,
        partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
    execute_low_latency_step_two<Torus, params>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, bootstrapping_key, global_accumulator,
        global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem,
        max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two,
        full_dm_step_two);
  }
}
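
// Layout of pbs_buffer as carved up by host_bootstrap_low_latency (comment
// added for clarity; the sizes follow get_buffer_size_bootstrap_low_latency):
//
//   [ global_accumulator_fft : (k + 1) * level_count * num_inputs * (N / 2) double2 ]
//   [ global_accumulator     : (k + 1) * num_inputs * N                    Torus    ]
//   [ d_mem                  : per-block scratch, only used when a kernel           ]
//   [                          cannot run entirely in shared memory                 ]
//
// with k = glwe_dimension and N = polynomial_size.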

#endif // LOWLAT_PBS_H

486  backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_multibit.cu  Normal file
@@ -0,0 +1,486 @@
#include "../polynomial/parameters.cuh"
#include "bootstrap_fast_multibit.cuh"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"

void checks_multi_bit_pbs(int polynomial_size) {
  assert(
      ("Error (GPU multi-bit PBS): polynomial size should be one of 256, 512, "
       "1024, 2048, 4096, 8192, 16384",
       polynomial_size == 256 || polynomial_size == 512 ||
           polynomial_size == 1024 || polynomial_size == 2048 ||
           polynomial_size == 4096 || polynomial_size == 8192 ||
           polynomial_size == 16384));
}
|
||||
|
||||
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
|
||||
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
|
||||
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
|
||||
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
|
||||
|
||||
checks_multi_bit_pbs(polynomial_size);
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<256>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 512:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<512>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 1024:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 2048:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 4096:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 8192:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 16384:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory)) {
|
||||
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
} else {
|
||||
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
static_cast<uint64_t *>(lut_vector),
|
||||
static_cast<uint64_t *>(lut_vector_indexes),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_samples, num_luts, lwe_idx,
|
||||
max_shared_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void scratch_cuda_multi_bit_pbs_64(
|
||||
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<256>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 512:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<512>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 1024:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<1024>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 2048:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<2048>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 4096:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<4096>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 8192:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<8192>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
case 16384:
|
||||
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
|
||||
AmortizedDegree<16384>>(
|
||||
glwe_dimension, level_count, input_lwe_ciphertext_count,
|
||||
max_shared_memory)) {
|
||||
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
} else {
|
||||
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
|
||||
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, grouping_factor,
|
||||
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer) {

  // Free memory
  cuda_drop_async(*pbs_buffer, stream);
}
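
// Illustrative call sequence for the multi-bit PBS entry points above (the
// parameter values are placeholders; allocating and filling the device buffers
// for ciphertexts, LUTs and the bootstrapping key is assumed to happen
// elsewhere):
//
//   int8_t *pbs_buffer = nullptr;
//   scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
//                                 glwe_dimension, polynomial_size, level_count,
//                                 grouping_factor, num_samples,
//                                 max_shared_memory, true, 0);
//   cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
//       stream, lwe_array_out, lwe_output_indexes, lut_vector,
//       lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
//       pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
//       grouping_factor, base_log, level_count, num_samples, num_luts, 0,
//       max_shared_memory, 0);
//   cleanup_cuda_multi_bit_pbs(stream, &pbs_buffer);
//
// Passing 0 as lwe_chunk_size lets the implementation pick a value via
// get_average_lwe_chunk_size.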

// Pick the best possible chunk size for each GPU
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
                                     uint32_t level_count,
                                     uint32_t glwe_dimension,
                                     uint32_t num_samples) {

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0); // Assuming device 0

  const char *v100Name = "V100"; // Known name of V100 GPU
  const char *a100Name = "A100"; // Known name of A100 GPU
  const char *h100Name = "H100"; // Known name of H100 GPU

  if (std::strstr(deviceProp.name, v100Name) != nullptr) {
    // Tesla V100
    if (num_samples == 1)
      return 60;
    else if (num_samples == 2)
      return 40;
    else if (num_samples <= 4)
      return 20;
    else if (num_samples <= 8)
      return 10;
    else if (num_samples <= 16)
      return 40;
    else if (num_samples <= 32)
      return 27;
    else if (num_samples <= 64)
      return 20;
    else if (num_samples <= 128)
      return 18;
    else if (num_samples <= 256)
      return 16;
    else if (num_samples <= 512)
      return 15;
    else if (num_samples <= 1024)
      return 15;
    else
      return 12;
  } else if (std::strstr(deviceProp.name, a100Name) != nullptr) {
    // Tesla A100
    if (num_samples < 4)
      return 11;
    else if (num_samples < 8)
      return 6;
    else if (num_samples < 16)
      return 13;
    else if (num_samples < 64)
      return 19;
    else if (num_samples < 128)
      return 1;
    else if (num_samples < 512)
      return 19;
    else if (num_samples < 1024)
      return 17;
    else if (num_samples < 8192)
      return 19;
    else if (num_samples < 16384)
      return 12;
    else
      return 9;
  } else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
    // Tesla H100
    return 45;
  }

  // Generic case
  return 1;
}

// Returns a chunk size that is not optimal but close to it
__host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
                                             uint32_t level_count,
                                             uint32_t glwe_dimension,
                                             uint32_t ct_count) {

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0); // Assuming device 0

  const char *v100Name = "V100"; // Known name of V100 GPU
  const char *a100Name = "A100"; // Known name of A100 GPU
  const char *h100Name = "H100"; // Known name of H100 GPU

  if (std::strstr(deviceProp.name, v100Name) != nullptr) {
    // Tesla V100
    return (ct_count > 10000) ? 12 : 18;
  } else if (std::strstr(deviceProp.name, a100Name) != nullptr) {
    // Tesla A100
    return (ct_count > 10000) ? 30 : 45;
  } else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
    // Tesla H100
    return (ct_count > 10000) ? 30 : 45;
  }

  // Generic case
  return (ct_count > 10000) ? 2 : 10;
}

// Returns the maximum buffer size required to execute batches of up to
// max_input_lwe_ciphertext_count ciphertexts
// todo: Deprecate this function
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t max_input_lwe_ciphertext_count) {

  uint64_t max_buffer_size = 0;
  for (uint32_t input_lwe_ciphertext_count = 1;
       input_lwe_ciphertext_count <= max_input_lwe_ciphertext_count;
       input_lwe_ciphertext_count *= 2) {
    max_buffer_size = std::max(
        max_buffer_size,
        get_buffer_size_multibit_bootstrap<uint64_t>(
            glwe_dimension, polynomial_size, level_count,
            input_lwe_ciphertext_count,
            get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
                               input_lwe_ciphertext_count)));
  }

  return max_buffer_size;
}
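
// Example (illustrative parameters): querying the worst-case buffer size for
// batches of up to 1024 ciphertexts,
//   uint64_t max_bytes =
//       get_max_buffer_size_multibit_bootstrap(742, 1, 2048, 1, 1024);
// Only power-of-two batch sizes are sampled by the loop above, so the result
// should be treated as an estimate rather than an exact maximum over every
// possible batch size.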

477  backends/tfhe-cuda-backend/cuda/src/pbs/bootstrap_multibit.cuh  Normal file
@@ -0,0 +1,477 @@
#ifndef CUDA_MULTIBIT_PBS_CUH
#define CUDA_MULTIBIT_PBS_CUH

#include "bootstrap.h"
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_multibit.h"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
#include <vector>

template <typename Torus, class params>
__device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
                                            uint32_t ggsw_idx,
                                            uint32_t grouping_factor) {
  Torus x = 0;
  for (int i = 0; i < grouping_factor; i++) {
    uint32_t mask_position = grouping_factor - (i + 1);
    int selection_bit = (ggsw_idx >> mask_position) & 1;
    x += selection_bit * lwe_array_group[i];
  }

  return rescale_torus_element(
      x, 2 * params::degree); // 2 * params::log2_degree + 1);
}
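
// Worked example (added for illustration): with grouping_factor = 2 the
// bootstrapping key stores 2^2 = 4 GGSW ciphertexts per group, one for every
// subset of the two grouped LWE mask elements (a_0, a_1). For ggsw_idx = 0b10
// the loop above selects a_0 only, for 0b11 it selects a_0 + a_1, and the sum
// is rescaled from the torus to [0, 2N) so it can be used as the degree of the
// monomial multiplied into the accumulator during the external product.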
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void device_multi_bit_bootstrap_keybundle(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *keybundle_array,
|
||||
Torus *bootstrapping_key, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size,
|
||||
uint32_t keybundle_size_per_input) {
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory = sharedmem;
|
||||
|
||||
// Ids
|
||||
uint32_t level_id = blockIdx.z;
|
||||
uint32_t glwe_id = blockIdx.y / (glwe_dimension + 1);
|
||||
uint32_t poly_id = blockIdx.y % (glwe_dimension + 1);
|
||||
uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset);
|
||||
uint32_t input_idx = blockIdx.x / lwe_chunk_size;
|
||||
|
||||
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
|
||||
//
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
input_idx * keybundle_size_per_input;
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Computes all keybundles
|
||||
uint32_t rev_lwe_iteration =
|
||||
((lwe_dimension / grouping_factor) - lwe_iteration - 1);
|
||||
|
||||
// ////////////////////////////////
|
||||
// Keygen guarantees the first term is a constant term of the polynomial, no
|
||||
// polynomial multiplication required
|
||||
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
|
||||
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
|
||||
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
|
||||
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
bsk_poly, accumulator);
|
||||
|
||||
// Accumulate the other terms
|
||||
for (int g = 1; g < (1 << grouping_factor); g++) {
|
||||
|
||||
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
|
||||
bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
|
||||
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
|
||||
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
|
||||
|
||||
// Calculates the monomial degree
|
||||
Torus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, g, grouping_factor);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
// Multiply by the bsk element
|
||||
polynomial_product_accumulate_by_monomial<Torus, params>(
|
||||
accumulator, bsk_poly, monomial_degree, false);
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
double2 *fft = (double2 *)sharedmem;
|
||||
|
||||
// Move accumulator to local memory
|
||||
double2 temp[params::opt / 2];
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
|
||||
temp[i].y =
|
||||
__ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
|
||||
temp[i].x /= (double)std::numeric_limits<Torus>::max();
|
||||
temp[i].y /= (double)std::numeric_limits<Torus>::max();
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
// Move from local memory back to shared memory but as complex
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = temp[i];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
|
||||
// lwe iteration
|
||||
auto keybundle_out = get_ith_mask_kth_block(
|
||||
keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id,
|
||||
polynomial_size, glwe_dimension, level_count);
|
||||
auto keybundle_poly = keybundle_out + poly_id * params::degree / 2;
|
||||
|
||||
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
|
||||
fft, keybundle_poly);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void device_multi_bit_bootstrap_accumulate_step_one(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *global_accumulator,
|
||||
double2 *global_accumulator_fft, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t lwe_iteration) {
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory;
|
||||
|
||||
selected_memory = sharedmem;
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
double2 *accumulator_fft =
|
||||
(double2 *)accumulator +
|
||||
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
|
||||
|
||||
Torus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)];
|
||||
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
double2 *global_fft_slice =
|
||||
global_accumulator_fft +
|
||||
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
|
||||
blockIdx.z * level_count * (glwe_dimension + 1)) *
|
||||
(polynomial_size / 2);
|
||||
|
||||
if (lwe_iteration == 0) {
|
||||
// First iteration
|
||||
////////////////////////////////////////////////////////////
|
||||
// Initializes the accumulator with the body of LWE
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
2 * params::degree);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
|
||||
// Persist
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_slice);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
global_slice, accumulator);
|
||||
}
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
gadget_acc.decompose_and_compress_next_polynomial(accumulator_fft,
|
||||
blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
|
||||
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
|
||||
accumulator_fft, global_fft_slice);
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
__global__ void device_multi_bit_bootstrap_accumulate_step_two(
|
||||
Torus *lwe_array_out, Torus *lwe_output_indexes, double2 *keybundle_array,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor, uint32_t iteration,
|
||||
uint32_t lwe_offset, uint32_t lwe_chunk_size) {
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory;
|
||||
|
||||
selected_memory = sharedmem;
|
||||
double2 *accumulator_fft = (double2 *)selected_memory;
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.x * lwe_chunk_size * level_count *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2);
|
||||
|
||||
double2 *global_accumulator_fft_input =
|
||||
global_accumulator_fft +
|
||||
blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2);
|
||||
|
||||
for (int level = 0; level < level_count; level++) {
|
||||
double2 *global_fft_slice =
|
||||
global_accumulator_fft_input +
|
||||
level * (glwe_dimension + 1) * (polynomial_size / 2);
|
||||
|
||||
for (int j = 0; j < (glwe_dimension + 1); j++) {
|
||||
double2 *fft = global_fft_slice + j * params::degree / 2;
|
||||
|
||||
// Get the bootstrapping key piece necessary for the multiplication
|
||||
// It is already in the Fourier domain
|
||||
auto bsk_slice =
|
||||
get_ith_mask_kth_block(keybundle, iteration, j, level,
|
||||
polynomial_size, glwe_dimension, level_count);
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
|
||||
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
accumulator_fft, fft, bsk_poly, !level && !j);
|
||||
}
|
||||
}
|
||||
|
||||
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, global_slice, true);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
uint32_t lwe_iteration = iteration + lwe_offset;
|
||||
if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) {
|
||||
// Last iteration
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.x] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_keybundle(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size; // accumulator
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_one(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size * 2; // accumulator
}

template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_two(uint32_t polynomial_size) {
  return sizeof(Torus) * polynomial_size; // accumulator
}

template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_multibit_bootstrap(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {

  uint64_t buffer_size = 0;
  buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
                 (glwe_dimension + 1) * (glwe_dimension + 1) *
                 (polynomial_size / 2) * sizeof(double2); // keybundle fft
  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
                 level_count * (polynomial_size / 2) *
                 sizeof(double2); // global_accumulator_fft
  buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
                 polynomial_size * sizeof(Torus); // global_accumulator

  return buffer_size + buffer_size % sizeof(double2);
}
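
// Worked example (illustrative values): for one input ciphertext with
// glwe_dimension = 1, polynomial_size = 2048, level_count = 1 and
// lwe_chunk_size = 1, the three regions are
//   keybundle fft          : 1 * 1 * 1 * 2 * 2 * 1024 * sizeof(double2) = 65536 B
//   global_accumulator_fft : 1 * 2 * 1 * 1024 * sizeof(double2)         = 32768 B
//   global_accumulator     : 1 * 2 * 2048 * sizeof(uint64_t)            = 32768 B
// so get_buffer_size_multibit_bootstrap<uint64_t>(1, 2048, 1, 1, 1) returns
// 131072 bytes.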
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void
|
||||
scratch_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t grouping_factor, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t full_sm_accumulate_step_one =
|
||||
get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t full_sm_accumulate_step_two =
|
||||
get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_bootstrap_keybundle<Torus, params>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
|
||||
cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm_accumulate_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm_accumulate_step_two));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size =
|
||||
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
|
||||
uint64_t buffer_size = get_buffer_size_multibit_bootstrap<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, class params>
|
||||
__host__ void host_multi_bit_pbs(
|
||||
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
|
||||
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
|
||||
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
|
||||
// If a chunk size is not passed to this function, select one.
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count,
|
||||
glwe_dimension, num_samples);
|
||||
//
|
||||
double2 *keybundle_fft = (double2 *)pbs_buffer;
|
||||
double2 *global_accumulator_fft =
|
||||
(double2 *)keybundle_fft +
|
||||
num_samples * lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
Torus *global_accumulator =
|
||||
(Torus *)global_accumulator_fft +
|
||||
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
|
||||
level_count * (polynomial_size / 2) / sizeof(Torus));
|
||||
|
||||
//
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t full_sm_accumulate_step_one =
|
||||
get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t full_sm_accumulate_step_two =
|
||||
get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
(glwe_dimension + 1) * (polynomial_size / 2);
|
||||
|
||||
//
|
||||
dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples);
|
||||
dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
uint32_t chunk_size = std::min(
|
||||
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
|
||||
// Compute a keybundle
|
||||
dim3 grid_keybundle(num_samples * chunk_size,
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1),
|
||||
level_count);
|
||||
device_multi_bit_bootstrap_keybundle<Torus, params>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream->stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// Accumulate
|
||||
for (int j = 0; j < chunk_size; j++) {
|
||||
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>
|
||||
<<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
|
||||
stream->stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
|
||||
lut_vector_indexes, global_accumulator,
|
||||
global_accumulator_fft, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log,
|
||||
level_count, j + lwe_offset);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>
|
||||
<<<grid_accumulate_step_two, thds, full_sm_accumulate_step_two,
|
||||
stream->stream>>>(lwe_array_out, lwe_output_indexes, keybundle_fft,
|
||||
global_accumulator, global_accumulator_fft,
|
||||
lwe_dimension, glwe_dimension, polynomial_size,
|
||||
level_count, grouping_factor, j, lwe_offset,
|
||||
lwe_chunk_size);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // MULTIBIT_PBS_H

500  backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh  Normal file
@@ -0,0 +1,500 @@
#ifndef CUDA_BSK_CUH
#define CUDA_BSK_CUH

#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "polynomial/parameters.cuh"
#include <atomic>
#include <cstdint>

__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,
                                         uint32_t level_count) {
  return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) *
         level_count;
}

////////////////////////////////////////////////
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
}

template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t level_count) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 level_count) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              glwe_dimension * polynomial_size / 2];
}

////////////////////////////////////////////////
__device__ inline int get_start_ith_lwe(uint32_t i, uint32_t grouping_factor,
                                        uint32_t polynomial_size,
                                        uint32_t glwe_dimension,
                                        uint32_t level_count) {
  return i * (1 << grouping_factor) * polynomial_size / 2 *
         (glwe_dimension + 1) * (glwe_dimension + 1) * level_count;
}

template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
    T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count) {
  T *ptr_group = ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
                                         glwe_dimension, level_count);
  return get_ith_mask_kth_block(ptr_group, g, k, level, polynomial_size,
                                glwe_dimension, level_count);
}
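
// Layout assumed by the accessors above (comment added for clarity): the
// Fourier-domain bootstrapping key is a flat array of N/2-element complex
// polynomials ordered as
//   [ input index i ][ decomposition level ][ GLWE row k ][ GLWE column ][ N/2 ]
// get_ith_mask_kth_block returns the first (mask) column of row k at the given
// level, while get_ith_body_kth_block points glwe_dimension polynomials
// further, at the body column of the same row. The multi-bit accessor first
// skips i * 2^grouping_factor whole GGSW ciphertexts and then indexes the g-th
// element of that group with the same row/level arithmetic.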
|
||||
////////////////////////////////////////////////
|
||||
template <typename T, typename ST>
|
||||
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
|
||||
cuda_stream_t *stream,
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim,
|
||||
uint32_t level_count,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t total_polynomials) {
|
||||
|
||||
cudaSetDevice(stream->gpu_index);
|
||||
int shared_memory_size = sizeof(double) * polynomial_size;
|
||||
|
||||
// Here the buffer size is the size of double2 times the number of polynomials
|
||||
// times the polynomial size over 2 because the polynomials are compressed
|
||||
// into the complex domain to perform the FFT
|
||||
size_t buffer_size =
|
||||
total_polynomials * polynomial_size / 2 * sizeof(double2);
|
||||
|
||||
int gridSize = total_polynomials;
|
||||
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
|
||||
|
||||
double2 *h_bsk = (double2 *)malloc(buffer_size);
|
||||
|
||||
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream);
|
||||
|
||||
// compress real bsk to complex and divide it on DOUBLE_MAX
|
||||
for (int i = 0; i < total_polynomials; i++) {
|
||||
int complex_current_poly_idx = i * polynomial_size / 2;
|
||||
int torus_current_poly_idx = i * polynomial_size;
|
||||
for (int j = 0; j < polynomial_size / 2; j++) {
|
||||
h_bsk[complex_current_poly_idx + j].x = src[torus_current_poly_idx + j];
|
||||
h_bsk[complex_current_poly_idx + j].y =
|
||||
src[torus_current_poly_idx + j + polynomial_size / 2];
|
||||
h_bsk[complex_current_poly_idx + j].x /=
|
||||
(double)std::numeric_limits<T>::max();
|
||||
h_bsk[complex_current_poly_idx + j].y /=
|
||||
(double)std::numeric_limits<T>::max();
|
||||
}
|
||||
}
|
||||
|
||||
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream);
|
||||
|
||||
  double2 *buffer;
  switch (polynomial_size) {
  case 256:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              d_bsk, dest, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(d_bsk, dest, buffer);
    }
    break;
  default:
    break;
  }

  cuda_drop_async(d_bsk, stream);
  cuda_drop_async(buffer, stream);
  free(h_bsk);
}

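// Illustration only (added for documentation; not called by the backend): a
// standalone host helper mirroring the real-to-complex packing performed in
// cuda_convert_lwe_bootstrap_key above, for a single 64-bit torus polynomial.
// The helper name and signature are assumptions, not part of the API.
static inline void pack_torus_polynomial_example(const int64_t *poly,
                                                 double2 *packed,
                                                 uint32_t polynomial_size) {
  const double norm = (double)std::numeric_limits<uint64_t>::max();
  for (uint32_t j = 0; j < polynomial_size / 2; j++) {
    // first half of the coefficients feeds the real part, second half the
    // imaginary part; both are normalized by the unsigned 64-bit maximum,
    // exactly as in the conversion loop above
    packed[j].x = (double)poly[j] / norm;
    packed[j].y = (double)poly[j + polynomial_size / 2] / norm;
  }
}
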
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
                                       cuda_stream_t *stream,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t level_count,
                                       uint32_t polynomial_size) {
  uint32_t total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
      (double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
      level_count, polynomial_size, total_polynomials);
}

void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
                                       cuda_stream_t *stream,
                                       uint32_t input_lwe_dim,
                                       uint32_t glwe_dim, uint32_t level_count,
                                       uint32_t polynomial_size) {
  uint32_t total_polynomials =
      input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
  cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
      (double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
      level_count, polynomial_size, total_polynomials);
}

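// Sizing example for the two conversion entry points above (hypothetical
// parameters, for illustration only): with input_lwe_dim = 800, glwe_dim = 1
// and level_count = 1, total_polynomials = 800 * 2 * 2 * 1 = 3200, i.e. 3200
// polynomials of polynomial_size / 2 double2 coefficients each once converted
// to the Fourier domain.
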
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
    uint32_t grouping_factor) {
  uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
                               level_count * (1 << grouping_factor) /
                               grouping_factor;
  size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);

  cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
                           stream);
}

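// Sizing example for the multi-bit key copy above (hypothetical parameters,
// for illustration only): with input_lwe_dim = 800, glwe_dim = 1,
// level_count = 1, grouping_factor = 2 and polynomial_size = 2048,
// total_polynomials = 800 * 2 * 2 * 1 * (1 << 2) / 2 = 6400 and
// buffer_size = 6400 * 2048 * sizeof(uint64_t) = 100 MiB, copied to the GPU
// as-is (no Fourier conversion is performed here).
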
void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
                                 cuda_stream_t *stream,
                                 uint32_t polynomial_size,
                                 uint32_t total_polynomials) {

  auto input1 = (double2 *)_input1;
  auto input2 = (double2 *)_input2;
  auto output = (double2 *)_output;

  size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;

  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

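  // Shared memory example (hypothetical size, for illustration only): the
  // inputs are already in the Fourier domain, so a block needs
  // sizeof(double2) * polynomial_size / 2 bytes, e.g. 16 * 2048 / 2 = 16 KiB
  // for polynomial_size = 2048; the FULLSM/NOSM dispatch below mirrors the
  // one used for the bootstrapping key conversion.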
  double2 *buffer;
  switch (polynomial_size) {
  case 256:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
      buffer = (double2 *)cuda_malloc_async(0, stream);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                           FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream->stream>>>(
              input1, input2, output, buffer);
    } else {
      buffer = (double2 *)cuda_malloc_async(
          shared_memory_size * total_polynomials, stream);
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
          <<<gridSize, blockSize, 0, stream->stream>>>(input1, input2, output,
                                                       buffer);
    }
    break;
  default:
    break;
  }
  cuda_drop_async(buffer, stream);
}

// We need these lines so the compiler knows how to specialize these functions
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
                                                    int level,
                                                    uint32_t polynomial_size,
                                                    int glwe_dimension,
                                                    uint32_t level_count);
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
                                                     int k, int level,
                                                     uint32_t polynomial_size,
                                                     int glwe_dimension,
                                                     uint32_t level_count);
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                    int level,
                                                    uint32_t polynomial_size,
                                                    int glwe_dimension,
                                                    uint32_t level_count);

template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
    uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);

template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
    double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
    uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);

#endif // CNCRT_BSK_H