chore(gpu): fix some pke bugs

chore(gpu): reduce the pool size
chore(gpu): add drift to pke gpu tests
2026-04-28 03:01:21 -04:00 · 2025-06-19 12:22:39 +02:00 · 2025-06-17 13:20:58 +02:00 · 2025-06-16 17:36:11 +02:00 · 2025-06-16 15:17:54 +02:00 · 2025-06-16 12:27:27 +02:00
653 changed files with 20712 additions and 29904 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1 @@
 *.hpu filter=lfs diff=lfs merge=lfs -text
-*.bcode filter=lfs diff=lfs merge=lfs -text
-*.cbor filter=lfs diff=lfs merge=lfs -text
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -23,7 +23,6 @@ runs:
        echo "${CMAKE_SCRIPT_SHA} cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
        sha256sum -c checksum
        sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
-        sudo apt remove -y unattended-upgrades
        sudo apt update
        sudo apt install -y cmake-format libclang-dev
      env:
@@ -51,13 +50,11 @@ runs:
    - name: Export CUDA variables
      shell: bash
      run: |
-        find /usr/local -executable -name "nvcc"
        CUDA_PATH=/usr/local/cuda-"${CUDA_VERSION}"
        {
          echo "CUDA_PATH=$CUDA_PATH";
          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH";
          echo "CUDA_MODULE_LOADER=EAGER";
-          echo "PATH=$PATH:$CUDA_PATH/bin"; 
        } >> "${GITHUB_ENV}"
        {
          echo "PATH=$PATH:$CUDA_PATH/bin"; 
@@ -77,11 +74,6 @@ runs:
      env:
        GCC_VERSION: ${{ inputs.gcc-version }}

-    - name: Check setup
-      shell: bash
-      run: |
-        which nvcc
-
    - name: Check device is detected
      shell: bash
      run: nvidia-smi
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -71,26 +71,45 @@ jobs:
        with:
          toolchain: stable

-      # Cache key is an aggregated hash of lfs files hashes
-      - name: Get LFS data sha
-        id: hash-lfs-data
+      - name: Use specific data branch
+        if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
+        env:
+          PR_BRANCH: ${{ github.head_ref || github.ref_name }}
        run: |
-          SHA=$(git lfs ls-files -l -I utils/tfhe-backward-compat-data | sha256sum | cut -d' ' -f1)
+          echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
+
+      - name: Get backward compat branch
+        id: backward_compat_branch
+        run: |
+          BRANCH="$(make backward_compat_branch)"
+          echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
+
+      - name: Get backward compat branch head SHA
+        id: backward_compat_sha
+        run: |
+          SHA=$(git ls-remote "${REPO_URL}" refs/heads/"${BACKWARD_COMPAT_BRANCH}" | awk '{print $1}')
          echo "sha=${SHA}" >> "${GITHUB_OUTPUT}"
+        env:
+          REPO_URL: "https://github.com/zama-ai/tfhe-backward-compat-data"
+          BACKWARD_COMPAT_BRANCH: ${{ steps.backward_compat_branch.outputs.branch }}

      - name: Retrieve data from cache
        id: retrieve-data-cache
        uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
        with:
-          path: |
-            utils/tfhe-backward-compat-data/**/*.cbor
-            utils/tfhe-backward-compat-data/**/*.bcode
-          key: ${{ steps.hash-lfs-data.outputs.sha }}
+          path: tests/tfhe-backward-compat-data
+          key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}

-      - name: Pull test data
+      - name: Clone test data
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
-        run: |
-          make pull_backward_compat_data
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+          repository: zama-ai/tfhe-backward-compat-data
+          path: tests/tfhe-backward-compat-data
+          lfs: 'true'
+          ref: ${{ steps.backward_compat_branch.outputs.branch }}

      - name: Run backward compatibility tests
        run: |
@@ -101,10 +120,8 @@ jobs:
        continue-on-error: true
        uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
        with:
-          path: |
-            utils/tfhe-backward-compat-data/**/*.cbor
-            utils/tfhe-backward-compat-data/**/*.bcode
-          key: ${{ steps.hash-lfs-data.outputs.sha }}
+          path: tests/tfhe-backward-compat-data
+          key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}

      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -103,7 +103,7 @@ jobs:
    name: Unsigned integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
--- a/.github/workflows/aws_tfhe_noise_checks.yml
+++ b/.github/workflows/aws_tfhe_noise_checks.yml
@@ -1,113 +0,0 @@
-name: Run noise checks on CPU
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  SLACKIFY_MARKDOWN: true
-  PULL_REQUEST_MD_LINK: ""
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-
-permissions:
-  contents: read
-
-jobs:
-  setup-instance:
-    name: Setup instance (noise-checks)
-    runs-on: ubuntu-latest
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: aws
-          # We want an hpc7a more compute, will be faster
-          profile: bench
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "Cannot run this without secrets"
-          exit 1
-
-  noise-checks:
-    name: CPU noise checks
-    needs: setup-instance
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: stable
-
-      - name: Run noise checks
-        run: |
-          make test_noise_check
-
-      - name: Set pull-request URL
-        if: ${{ failure() && github.event_name == 'pull_request' }}
-        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
-        env:
-          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-      - name: Slack Notification
-        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Noise checks tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
-
-  teardown-instance:
-    name: Teardown instance (noise-checks)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, noise-checks ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (noise-checks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -104,7 +104,7 @@ jobs:
    name: Signed integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -31,7 +31,6 @@ on:
          - ks
          - ks_pbs
          - integer_zk
-          - hlapi_noise_squash
      op_flavor:
        description: "Operations set to run"
        type: choice
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -192,7 +192,7 @@ jobs:
      matrix:
        # explicit include-based build matrix, of known valid options
        include:
-          - cuda: "12.8"
+          - cuda: "12.2"
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
@@ -224,7 +224,7 @@ jobs:
        params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
        # explicit include-based build matrix, of known valid options
        include:
-          - cuda: "12.8"
+          - cuda: "12.2"
            gcc: 11
    env:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
--- a/.github/workflows/benchmark_gpu_dex_common.yml
+++ b/.github/workflows/benchmark_gpu_dex_common.yml
@@ -100,7 +100,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo with tags
--- a/.github/workflows/benchmark_gpu_erc20_common.yml
+++ b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -101,7 +101,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo with tags
--- a/.github/workflows/benchmark_gpu_weekly.yml
+++ b/.github/workflows/benchmark_gpu_weekly.yml
@@ -10,16 +10,37 @@ on:
 permissions: {}

 jobs:
-  run-benchmarks-8-h100-sxm5-integer:
-    name: Run integer benchmarks (8xH100-SXM5)
+  run-benchmarks-1-h100:
+    name: Run integer benchmarks (1xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100x8-SXM5
+      profile: single-h100
+      hardware_name: n3-H100x1
+      command: integer,integer_multi_bit
+      op_flavor: default
+      bench_type: latency
+      all_precisions: true
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-2-h100:
+    name: Run integer benchmarks (2xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: 2-h100
+      hardware_name: n3-H100x2
      command: integer_multi_bit
      op_flavor: default
-      bench_type: both
+      bench_type: latency
      all_precisions: true
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -31,16 +52,16 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-8-h100-sxm5-integer-compression:
-    name: Run integer compression benchmarks (8xH100-SXM5)
+  run-benchmarks-8-h100:
+    name: Run integer benchmarks (8xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100x8-SXM5
-      command: integer_compression
+      profile: multi-h100
+      hardware_name: n3-H100x8
+      command: integer_multi_bit
      op_flavor: default
-      bench_type: both
+      bench_type: latency
      all_precisions: true
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -52,37 +73,16 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-8-h100-sxm5-integer-zk:
-    name: Run integer zk benchmarks (8xH100-SXM5)
+  run-benchmarks-l40:
+    name: Run integer benchmarks (L40)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100x8-SXM5
-      command: integer_zk
+      profile: l40
+      hardware_name: n3-L40x1
+      command: integer_multi_bit,integer_compression,pbs,ks
      op_flavor: default
-      bench_type: both
-      all_precisions: true
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-8-h100-sxm5-noise-squash:
-    name: Run integer zk benchmarks (8xH100-SXM5)
-    if: github.repository == 'zama-ai/tfhe-rs'
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100x8-SXM5
-      command: hlapi_noise_squash
-      op_flavor: default
-      bench_type: both
+      bench_type: latency
      all_precisions: true
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
--- a/.github/workflows/benchmark_hpu_integer.yml
+++ b/.github/workflows/benchmark_hpu_integer.yml
@@ -33,7 +33,6 @@ jobs:
        with:
          fetch-depth: 0
          persist-credentials: 'false'
-          lfs: true
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Get benchmark details
@@ -62,7 +61,6 @@ jobs:

      - name: Run benchmarks
        run: |
-          make pull_hpu_files
          make bench_integer_hpu
          make bench_hlapi_erc20_hpu

--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -48,7 +48,7 @@ jobs:
    name: Execute FFT benchmarks in EC2
    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      group: ${{ github.workflow_ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
    steps:
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -48,7 +48,7 @@ jobs:
    name: Execute NTT benchmarks in EC2
    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      group: ${{ github.workflow_ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
    steps:
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -49,14 +49,6 @@ jobs:
          mv linelint-linux-amd64 /usr/local/bin/linelint
          make check_newline

-      # This is needed for the ws tests clippy checks
-      - name: Use specific data branch
-        if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
-        env:
-          PR_BRANCH: ${{ github.head_ref || github.ref_name }}
-        run: |
-          echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
-
      - name: Run pcc checks
        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -13,7 +13,7 @@ env:
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

 permissions:
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -13,7 +13,7 @@ env:
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

 permissions:
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -3,15 +3,14 @@ name: Check commit and PR compliance
 on:
  pull_request:

-permissions: {}
+permissions:
+  contents: read
+  pull-requests: read # Permission needed to scan commits in a pull-request

 jobs:
  check-commit-pr:
    name: Check commit and PR
    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: write # Permission needed to scan commits in a pull-request and write issue comment
    steps:
      - name: Check first line
        uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
--- a/.github/workflows/data_pr_close.yml
+++ b/.github/workflows/data_pr_close.yml
@@ -0,0 +1,63 @@
+name: Close or Merge corresponding PR on the data repo
+
+# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
+
+env:
+  DATA_REPO: zama-ai/tfhe-backward-compat-data
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  PR_BRANCH: ${{ github.head_ref || github.ref_name }}
+  CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}
+
+# only trigger on pull request closed events
+on:
+  pull_request:
+    types: [ closed ]
+
+permissions: {}
+
+jobs:
+  auto_close_job:
+    if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') && github.repository == 'zama-ai/tfhe-rs' }}
+    runs-on: ubuntu-latest
+    env:
+      GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}  # Needed for gh CLI commands
+    steps:
+    - name: Fetch PR number
+      run: |
+        PR_NUMBER=$(gh pr view "${PR_BRANCH}" --repo "${DATA_REPO}" --json number | jq '.number')
+        echo "DATA_REPO_PR_NUMBER=${PR_NUMBER}" >> "${GITHUB_ENV}"
+
+    - name: Comment on the PR to indicate the reason of the close
+      run: |
+        gh pr comment "${PR_BRANCH}" \
+        --repo "${DATA_REPO}" \
+        --body "PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}"
+      env:
+        REPO: ${{ github.repository }}
+        EVENT_NUMBER: ${{ github.event.number }}
+
+    - name: Merge the Pull Request in the data repo
+      if: ${{ github.event.pull_request.merged }}
+      run: |
+        gh pr merge "${PR_BRANCH}" \
+        --repo "${DATA_REPO}" \
+        --rebase \
+        --delete-branch
+
+    - name: Close the Pull Request in the data repo
+      if: ${{ !github.event.pull_request.merged }}
+      run: |
+        gh pr close "${PR_BRANCH}" \
+        --repo "${DATA_REPO}" \
+        --delete-branch
+
+    - name: Slack Notification
+      if: ${{ always() && job.status == 'failure' }}
+      continue-on-error: true
+      uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+      env:
+        SLACK_COLOR: ${{ job.status }}
+        SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: https://github.com/${{ env.DATA_REPO }}/pull/${{ env.DATA_REPO_PR_NUMBER }}"
--- a/.github/workflows/gpu_code_validation_tests.yml
+++ b/.github/workflows/gpu_code_validation_tests.yml
@@ -1,148 +0,0 @@
-# Compile and test tfhe-cuda-backend on an AWS instance
-name: Cuda - Code Validation
-
-env:
-  CARGO_TERM_COLOR: always
-  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
-  RUSTFLAGS: "-C target-cpu=native"
-  RUST_BACKTRACE: "full"
-  RUST_MIN_STACK: "8388608"
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  SLACKIFY_MARKDOWN: true
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  PULL_REQUEST_MD_LINK: ""
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
-
-on:
-  # Allows you to run this workflow manually from the Actions tab as an alternative.
-  workflow_dispatch:
-  pull_request:
-    types: [ labeled ]
-
-permissions:
-  contents: read
-
-jobs:
-  setup-instance:
-    name: Setup instance (cuda-tests)
-    runs-on: ubuntu-latest
-    if: github.event_name != 'pull_request' ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved')
-    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
-    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
-        with:
-          mode: start
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: gpu-test
-
-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
-
-  cuda-tests-linux:
-    name: CUDA Code Validation tests
-    needs: [ setup-instance ]
-    if: github.event_name != 'pull_request' ||
-      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
-    concurrency:
-      group: ${{ github.workflow_ref }}
-      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    strategy:
-      fail-fast: false
-      # explicit include-based build matrix, of known valid options
-      matrix:
-        include:
-          - os: ubuntu-22.04
-            cuda: "12.8"
-            gcc: 11 
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
-
-      - name: Find tools
-        run: |
-          find /usr -executable -name "compute-sanitizer"
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: stable
-
-      - name: Run memory sanitizer
-        run: |
-          make test_high_level_api_gpu_debug
-
-  slack-notify:
-    name: Slack Notification
-    needs: [ setup-instance, cuda-tests-linux ]
-    runs-on: ubuntu-latest
-    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
-    continue-on-error: true
-    steps:
-      - name: Set pull-request URL
-        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
-        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
-        env:
-          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
-          PR_NUMBER: ${{ github.event.pull_request.number }}
-
-      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "GPU code validation tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
-
-  teardown-instance:
-    name: Teardown instance (cuda-tests)
-    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [ setup-instance, cuda-tests-linux ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Stop remote instance
-        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
-        with:
-          mode: stop
-          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
-          slab-url: ${{ secrets.SLAB_BASE_URL }}
-          job-secret: ${{ secrets.JOB_SECRET }}
-          label: ${{ needs.setup-instance.outputs.runner-name }}
-
-      - name: Slack Notification
-        if: ${{ failure() }}
-        continue-on-error: true
-        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-        env:
-          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -122,7 +122,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -107,7 +107,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -62,7 +62,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -109,7 +109,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
@@ -137,7 +137,7 @@ jobs:
      # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
      - name: Run multi-bit CUDA integer tests
        run: |
-          BIG_TESTS_INSTANCE=TRUE NO_BIG_PARAMS_GPU=TRUE make test_integer_multi_bit_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci

      - name: Run user docs tests
        run: |
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -11,7 +11,6 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  IS_PR: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -19,8 +18,6 @@ on:
  schedule:
    # Nightly tests will be triggered each evening 8p.m.
    - cron: "0 20 * * *"
-  pull_request:
-

 permissions:
  contents: read
@@ -58,7 +55,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11 
    timeout-minutes: 4320 # 72 hours
    steps:
@@ -81,11 +78,7 @@ jobs:

      - name: Run tests
        run: |
-          if [[ "${IS_PR}" == "true" ]]; then
-            make test_integer_short_run_gpu
-          else
-            make test_integer_long_run_gpu
-          fi
+          make test_integer_long_run_gpu

  slack-notify:
    name: Slack Notification
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -109,7 +109,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -122,7 +122,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -25,6 +25,9 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+  schedule:
+    # Nightly tests @ 1AM after each work day
+    - cron: "0 1 * * MON-FRI"

 permissions:
  contents: read
@@ -110,7 +113,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -109,7 +109,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -122,7 +122,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11 
    steps:
      - name: Checkout tfhe-rs
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -25,6 +25,9 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+  schedule:
+    # Nightly tests @ 1AM after each work day
+    - cron: "0 1 * * MON-FRI"

 permissions:
  contents: read
@@ -110,7 +113,7 @@ jobs:
      matrix:
        include:
          - os: ubuntu-22.04
-            cuda: "12.8"
+            cuda: "12.2"
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -13,7 +13,7 @@ env:
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true


--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -93,9 +93,6 @@ jobs:
            echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"
-        env:
-          GCC_VERSION: ${{ matrix.gcc }}
-
      - name: Prepare package
        run: |
          cargo package -p tfhe-cuda-backend
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -21,7 +21,7 @@ jobs:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: git-sync
-        uses: valtech-sd/git-sync@e734cfe9485a92e720eac5af8a4555dde5fecf88
+        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
        with:
          source_repo: "zama-ai/tfhe-rs"
          source_branch: "main"
--- a/.github/workflows/unverified_prs.yml
+++ b/.github/workflows/unverified_prs.yml
@@ -1,26 +0,0 @@
-name: 'Close unverified PRs'
-on:
-  schedule:
-    - cron: '30 1 * * *'
-
-permissions: {}
-
-jobs:
-  stale:
-    runs-on: ubuntu-latest
-    permissions:
-      issues: read
-      pull-requests: write
-    steps:
-      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
-        with:
-          stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
-          days-before-stale: 2
-          days-before-close: 0
-          # We are not interested in suppressing issues so have a currently non existent label
-          # if we ever accept issues to become stale/closable this label will be the signal for that
-          only-issue-labels: can-be-auto-closed
-          # Only unverified PRs are an issue
-          exempt-pr-labels: cla-signed
-          # We don't want people commenting to keep an unverified PR
-          ignore-updates: true
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,9 @@ package-lock.json
 .env
 __pycache__

+# Dir used for backward compatibility test data
+# First directive is to ignore symlinks
+tests/tfhe-backward-compat-data
 ci/

 # In case someone clones the lattice-estimator locally to verify security
--- a/.linelint.yml
+++ b/.linelint.yml
@@ -10,7 +10,6 @@ ignore:
  - keys
  - coverage
  - utils/tfhe-lints/ui/main.stderr
-  - utils/tfhe-backward-compat-data/**/*.ron # ron files are autogenerated

 rules:
  # checks if file ends in a newline character
--- a/14
+++ b/14
@@ -1,28 +1,18 @@
 # Specifying a path without code owners means that path won't have owners and is akin to a negation
 # i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
 # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file
-
-/backends/tfhe-cuda-backend/            @agnesLeroy
-/backends/tfhe-hpu-backend/             @zama-ai/hardware
-
-/tfhe/examples/hpu                      @zama-ai/hardware
-
 /tfhe/src/core_crypto/                  @IceTDrinker
-/tfhe/src/core_crypto/gpu               @agnesLeroy
-/tfhe/src/core_crypto/hpu               @zama-ai/hardware
+/tfhe/src/core_crypto/gpu

 /tfhe/src/shortint/                     @mayeul-zama

 /tfhe/src/integer/                      @tmontaigu
-/tfhe/src/integer/gpu                   @agnesLeroy
-/tfhe/src/integer/hpu                   @zama-ai/hardware
+/tfhe/src/integer/gpu

 /tfhe/src/high_level_api/               @tmontaigu

 /Makefile                               @IceTDrinker @soonum

-/mockups/tfhe-hpu-mockup                @zama-ai/hardware
-
 /.github/                               @soonum

 /CODEOWNERS                             @IceTDrinker
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -170,8 +170,6 @@ On the contrary, these changes are *not* data breaking:
 * Renaming a type (unless it implements the `Named` trait).
 * Adding a variant to the end of an enum.

-Historical data from previous TFHE-rs versions are stored inside `utils/tfhe-backward-compat-data`. They are used to check on every PR that backward compatibility has been preserved.
-
 ## Example: adding a field

 Suppose you want to add an i32 field to a type named `MyType`. The original type is defined as:
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ members = [
 ]

 exclude = [
-    "utils/tfhe-backward-compat-data",
+    "tests/backward_compatibility_tests",
    "utils/tfhe-lints",
    "apps/trivium",
 ]
--- a/129
+++ b/129
@@ -22,7 +22,10 @@ BENCH_TYPE?=latency
 BENCH_PARAM_TYPE?=classical
 BENCH_PARAMS_SET?=default
 NODE_VERSION=22.6
-BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
+BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
+BACKWARD_COMPAT_DATA_BRANCH?=$(shell ./scripts/backward_compat_data_version.py)
+BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
+BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
 TFHE_SPEC:=tfhe
 WASM_PACK_VERSION="0.13.1"
 # We are kind of hacking the cut here, the version cannot contain a quote '"'
@@ -156,23 +159,23 @@ install_tarpaulin: install_rs_build_toolchain

 .PHONY: install_cargo_dylint # Install custom tfhe-rs lints
 install_cargo_dylint:
-	cargo install --locked cargo-dylint dylint-link
+	cargo install cargo-dylint dylint-link

 .PHONY: install_typos_checker # Install typos checker
 install_typos_checker: install_rs_build_toolchain
 	@typos --version > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked typos-cli || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
 	( echo "Unable to install typos-cli, unknown error." && exit 1 )

 .PHONY: install_zizmor # Install zizmor workflow security checker
 install_zizmor: install_rs_build_toolchain
 	@zizmor --version > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked zizmor --version ~1.9 || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
 	( echo "Unable to install zizmor, unknown error." && exit 1 )

-.PHONY: install_cargo_cross # Install cross for big endian tests
+.PHONY: install_cargo_cross # Install custom tfhe-rs lints
 install_cargo_cross: install_rs_build_toolchain
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked cross
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross

 .PHONY: setup_venv # Setup Python virtualenv for wasm tests
 setup_venv:
@@ -249,9 +252,6 @@ install_mlc: install_rs_build_toolchain
 .PHONY: fmt # Format rust code
 fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt

 .PHONY: fmt_js # Format javascript code
 fmt_js: check_nvm_installed
@@ -273,9 +273,6 @@ fmt_c_tests:
 .PHONY: check_fmt # Check rust code format
 check_fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check

 .PHONY: check_fmt_c_tests  # Check C tests format
 check_fmt_c_tests:
@@ -486,22 +483,10 @@ clippy_param_dedup: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p param_dedup -- --no-deps -D warnings

-.PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
-clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
-	@# Some old crates are x86 specific, only run in that case
-	@if uname -a | grep -q x86; then \
-		RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-			-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all-targets \
-			-- --no-deps -D warnings; \
-	else \
-		echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
-	fi
-
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
 clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
-clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
-clippy_backward_compat_data
+clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -517,12 +502,6 @@ clippy_hpu_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-hpu-backend -- --no-deps -D warnings

-.PHONY: clippy_hpu_mockup # Run clippy lints on tfhe-hpu-mockup
-clippy_hpu_mockup: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--all-targets \
-		-p tfhe-hpu-mockup -- --no-deps -D warnings
-
 .PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
 check_rust_bindings_did_not_change:
 	cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
@@ -535,9 +514,6 @@ check_rust_bindings_did_not_change:
 tfhe_lints: install_cargo_dylint
 	RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe --no-deps -- \
 		--features=boolean,shortint,integer,strings,zk-pok
-	RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe-zk-pok --no-deps -- \
-		--features=experimental
-

 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
@@ -679,30 +655,6 @@ test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

-.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
-test_integer_gpu_debug: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release_lto_off \
-		--features=integer,gpu-debug -vv -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=1 --nocapture
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
-		--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::
-
-.PHONY: test_high_level_api_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
-test_high_level_api_gpu_debug: install_rs_build_toolchain install_cargo_nextest
-	export RUSTFLAGS="$(RUSTFLAGS)" && \
-	export CARGO_RS_BUILD_TOOLCHAIN="$(CARGO_RS_BUILD_TOOLCHAIN)" && \
-	export TFHE_SPEC="$(TFHE_SPEC)" && \
-	export CARGO_PROFILE="$(CARGO_PROFILE)" &&	scripts/check_memory_errors.sh
-
-.PHONY: test_integer_hl_test_gpu_check_warnings
-test_integer_hl_test_gpu_check_warnings: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build \
-		--features=integer,internal-keycache,gpu-debug,zk-pok -vv -p $(TFHE_SPEC) &> /tmp/gpu_compile_output
-	WARNINGS=$$(cat /tmp/gpu_compile_output | grep ": warning:" | grep "\[tfhe-cuda-backend" | grep -v "inline function" || true) && \
-	if [[ "$${WARNINGS}" != "" ]]; then \
-		echo "$${WARNINGS}" && exit 1; \
-	fi
-
-
 .PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
 test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -711,12 +663,6 @@ test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
 		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
 		--tfhe-package "$(TFHE_SPEC)" --backend "gpu"

-.PHONY: test_integer_short_run_gpu # Run the long run integer tests on the gpu backend
-test_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
-	TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE \
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence integer::gpu::server_key::radix::tests_long_run::test_signed_random_op_sequence --test-threads=1 --nocapture
-
 .PHONY: test_integer_compression
 test_integer_compression: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -927,15 +873,6 @@ test_integer_long_run: install_rs_check_toolchain install_cargo_nextest
 		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
 		--tfhe-package "$(TFHE_SPEC)"

-.PHONY: test_noise_check # Run dedicated noise and pfail check tests
-test_noise_check: install_rs_check_toolchain
-	@# First run the sanity checks to make sure the atomic patterns are correct
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=boolean,shortint,integer,nightly-avx512 -p $(TFHE_SPEC) -- sanity_check
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=boolean,shortint,integer,nightly-avx512 -p $(TFHE_SPEC) -- noise_check \
-		--test-threads=1
-
 .PHONY: test_safe_serialization # Run the tests for safe serialization
 test_safe_serialization: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -1088,11 +1025,16 @@ test_tfhe_lints: install_cargo_dylint
 # Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
 .PHONY: test_backward_compatibility_ci
 test_backward_compatibility_ci: install_rs_build_toolchain
-	TFHE_BACKWARD_COMPAT_DATA_DIR="../$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tests/$(BACKWARD_COMPAT_DATA_DIR)\"" \
 		--features=shortint,integer,zk-pok -p tests test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
-test_backward_compatibility: pull_backward_compat_data test_backward_compatibility_ci
+test_backward_compatibility: tests/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
+
+.PHONY: backward_compat_branch # Prints the required backward compatibility branch
+backward_compat_branch:
+	@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"

 .PHONY: doc # Build rust doc
 doc: install_rs_check_toolchain
@@ -1137,10 +1079,6 @@ check_intra_md_links: install_mlc
 check_md_links: install_mlc
 	mlc --match-file-extension tfhe/docs

-.PHONY: check_doc_paths_use_dash # Check paths use "-" instead of "_" in docs for gitbook compatibility
-check_doc_paths_use_dash:
-	python3 ./scripts/check_doc_paths_use_dash.py
-
 .PHONY: check_parameter_export_ok # Checks exported "current" shortint parameter module is correct
 check_parameter_export_ok:
 	python3 ./scripts/check_current_param_export.py
@@ -1498,20 +1436,6 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

-.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
-bench_hlapi_noise_squash: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-noise-squash \
-	--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
-
-.PHONY: bench_hlapi_noise_squash_gpu # Run benchmarks for noise squash operation on GPU
-bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-noise-squash \
-	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
-
 #
 # Utility tools
 #
@@ -1569,13 +1493,11 @@ write_params_to_file: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
 	--example write_params_to_file --features=boolean,shortint,hpu,internal-keycache

-.PHONY: pull_backward_compat_data # Pull the data files needed for backward compatibility tests
-pull_backward_compat_data:
-	./scripts/pull_lfs_data.sh $(BACKWARD_COMPAT_DATA_DIR)
+.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
+clone_backward_compat_data:
+	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tests/$(BACKWARD_COMPAT_DATA_DIR)

-.PHONY: pull_hpu_files # Pull the hpu files
-pull_hpu_files:
-	./scripts/pull_lfs_data.sh backends/tfhe-hpu-backend/
+tests/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data

 #
 # Real use case examples
@@ -1601,20 +1523,19 @@ sha256_bool: install_rs_check_toolchain

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
 pcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
-check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash \
-clippy_all check_compile_tests test_tfhe_lints \
+check_md_docs_are_tested check_intra_md_links clippy_all check_compile_tests test_tfhe_lints \
 tfhe_lints

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
-clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu test_integer_hl_test_gpu_check_warnings
+clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu

 .PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
-pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast
+pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast

 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
 fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
-check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash clippy_fast check_compile_tests
+check_md_docs_are_tested clippy_fast check_compile_tests

 .PHONY: conformance # Automatically fix problems that can be fixed
 conformance: fix_newline fmt fmt_js
--- a/README.md
+++ b/README.md
@@ -18,7 +18,6 @@
  <a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
  <a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
  <a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
-  <a href="https://slsa.dev"><img alt="SLSA 3" src="https://slsa.dev/images/gh-badge-level3.svg" /></a>
 </p>

 ## About
@@ -149,7 +148,7 @@ To run this code, use the following command:
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performances possible.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*
+*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*

 <p align="right">
  <a href="#about" > ↑ Back to top </a>
@@ -204,7 +203,7 @@ When a new update is published in the Lattice Estimator, we update parameters ac
 By default, the parameter sets used in the High-Level API with the x86 CPU backend have a failure probability $\le 2^{128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
 If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).

-The default parameters used in the High-Level API with the GPU backend are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at $p_{error} \le 2^{-128}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [2].
+The default parameters used in the High-Level API with the GPU backend are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at $p_{error} \le 2^{-64}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [2].

 [1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf

--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.

 # FHE shortint Trivium implementation

-The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
+The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
 It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
 on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes
 its setup a little more intricate.
@@ -137,10 +137,10 @@ its setup a little more intricate.
 Example code:
 ```rust
 use tfhe::shortint::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{ConfigBuilder, generate_keys, FheUint64};
 use tfhe::prelude::*;
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

-    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {

 pub fn kreyvium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {

 pub fn kreyvium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};

 pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {

 pub fn trivium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {

 pub fn trivium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,9 +1,9 @@
 use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo renaud1239/Kreyvium,
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn kreyvium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,9 +1,9 @@
 use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn trivium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.11.0"
+version = "0.10.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -19,4 +19,3 @@ bindgen = "0.71"
 [features]
 experimental-multi-arch = []
 profile = []
-debug = []
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -53,10 +53,6 @@ fn main() {
            cmake_config.define("USE_NVTOOLS", "OFF");
        }

-        if cfg!(feature = "debug") {
-            cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
-        }
-
        // Build the CMake project
        let dest = cmake_config.build();
        println!("cargo:rustc-link-search=native={}", dest.display());
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -52,8 +52,6 @@ endif()

 if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
-else()
-  message("Building CUDA backend in ${CMAKE_BUILD_TYPE}")
 endif()

 # Add OpenMP support
@@ -78,10 +76,8 @@ endif()

 add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

-string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWERCASE)
-
 # Check if the DEBUG flag is defined
-if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  # Debug mode
  message("Compiling in Debug mode")
  add_definitions(-DDEBUG)
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -37,6 +37,17 @@ void cuda_glwe_sample_extract_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
    uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
+void cuda_modulus_switch_multi_bit_64(void *stream, uint32_t gpu_index,
+                                      void *lwe_array_out, void *lwe_array_in,
+                                      uint32_t size, uint32_t log_modulus,
+                                      uint32_t degree,
+                                      uint32_t grouping_factor);
+
+void cuda_modulus_switch_multi_bit_128(void *stream, uint32_t gpu_index,
+                                       void *lwe_array_out, void *lwe_array_in,
+                                       uint32_t size, uint32_t log_modulus,
+                                       uint32_t degree,
+                                       uint32_t grouping_factor);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -26,7 +26,6 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
    std::abort();                                                              \
  }

-uint32_t cuda_get_device();
 void cuda_set_device(uint32_t gpu_index);

 cudaEvent_t cuda_create_event(uint32_t gpu_index);
@@ -50,13 +49,12 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index);

 void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
+                                           uint64_t *size_tracker,
                                           bool allocate_gpu_memory);

 void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

 bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
-uint64_t cuda_device_total_memory(uint32_t gpu_index);

 void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
                                                 uint64_t size,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -40,6 +40,11 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(
 void cleanup_cuda_integer_decompress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);
-}

+void cuda_integer_extract_glwe_64(
+    void *const *streams, uint32_t const *gpu_indexes, void *lwe_array_out,
+    void const *glwe_list, uint32_t const glwe_index,
+    uint32_t const log_modulus, uint32_t const polynomial_size,
+    uint32_t const glwe_dimension, uint32_t const body_count);
+}
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -20,7 +20,7 @@ template <typename Torus> struct int_compression {
                  uint32_t gpu_count, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  uint32_t storage_log_modulus, bool allocate_gpu_memory,
-                  uint64_t &size_tracker) {
+                  uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
    this->lwe_per_glwe = lwe_per_glwe;
@@ -38,7 +38,7 @@ template <typename Torus> struct int_compression {
        lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
        gpu_indexes[0], size_tracker, allocate_gpu_memory);

-    size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
+    *size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
        streams[0], gpu_indexes[0], &fp_ks_buffer,
        compression_params.small_lwe_dimension,
        compression_params.glwe_dimension, compression_params.polynomial_size,
@@ -76,7 +76,7 @@ template <typename Torus> struct int_decompression {
                    int_radix_params compression_params,
                    uint32_t num_radix_blocks, uint32_t body_count,
                    uint32_t storage_log_modulus, bool allocate_gpu_memory,
-                    uint64_t &size_tracker) {
+                    uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->encryption_params = encryption_params;
    this->compression_params = compression_params;
@@ -124,7 +124,7 @@ template <typename Torus> struct int_decompression {
        encryption_params.carry_modulus, decompression_rescale_f,
        gpu_memory_allocated);

-    decompression_rescale_lut->broadcast_lut(streams, gpu_indexes);
+    decompression_rescale_lut->broadcast_lut(streams, gpu_indexes, 0);
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -48,34 +48,6 @@ typedef struct {
  uint32_t lwe_dimension;
 } CudaRadixCiphertextFFI;

-typedef struct {
-  uint64_t const *chosen_multiplier_has_at_least_one_set;
-  uint64_t const *decomposed_chosen_multiplier;
-
-  uint32_t const num_scalars;
-  uint32_t const active_bits;
-  uint64_t const shift_pre;
-  uint32_t const shift_post;
-  uint32_t const ilog2_chosen_multiplier;
-  uint32_t const chosen_multiplier_num_bits;
-
-  bool const is_chosen_multiplier_zero;
-  bool const is_abs_chosen_multiplier_one;
-  bool const is_chosen_multiplier_negative;
-  bool const is_chosen_multiplier_pow2;
-  bool const chosen_multiplier_has_more_bits_than_numerator;
-  // if signed: test if chosen_multiplier >= 2^{num_bits - 1}
-  bool const is_chosen_multiplier_geq_two_pow_numerator;
-
-  uint32_t const ilog2_divisor;
-
-  bool const is_divisor_zero;
-  bool const is_abs_divisor_one;
-  bool const is_divisor_negative;
-  bool const is_divisor_pow2;
-  bool const divisor_has_more_bits_than_numerator;
-} CudaScalarDivisorFFI;
-
 uint64_t scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -423,8 +395,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
-    bool allocate_ms_array);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -443,8 +414,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
-    bool allocate_ms_array);
+    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -577,6 +547,27 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
                              void *const *streams,
                              uint32_t const *gpu_indexes);

+uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool anticipated_buffer_drop, bool allocate_gpu_memory,
+    bool allocate_ms_array);
+
+void cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
+    uint64_t rhs, uint64_t const *decomposed_scalar,
+    uint64_t const *has_at_least_one_set,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_scalars);
+
+void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
 uint64_t scratch_cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -599,6 +590,9 @@ void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void);
+void cuda_small_scalar_multiplication_integer_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *lwe_array, uint64_t scalar);

 uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -621,115 +615,5 @@ void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
                                                 uint32_t const *gpu_indexes,
                                                 uint32_t gpu_count,
                                                 int8_t **mem_ptr_void);
-
-uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, bool allocate_ms_array);
-
-void cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi);
-
-void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t num_additional_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, bool allocate_ms_array);
-
-void cuda_extend_radix_with_sign_msb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
-    int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
-    void *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
-
-void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
-                                                uint32_t const *gpu_indexes,
-                                                uint32_t gpu_count,
-                                                int8_t **mem_ptr_void);
-
-uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, bool allocate_ms_array);
-
-void cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);
-
-void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
-    bool allocate_ms_array);
-
-void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    void const *clear_blocks, void const *h_clear_blocks,
-    uint32_t num_clear_blocks);
-
-void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
-    bool allocate_ms_array);
-
-void cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    uint32_t numerator_bits);
-
-void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
 } // extern C
 #endif // CUDA_INTEGER_H
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -65,6 +65,9 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
    void const *lwe_array_in, const uint64_t plaintext_in,
    const uint32_t input_lwe_dimension,
    const uint32_t input_lwe_ciphertext_count);
+void cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void const *plaintext_array_in, const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
 }
-
 #endif // CUDA_LINALG_H_
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -66,9 +66,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    uint32_t num_many_lut, uint32_t lut_stride);

 template <typename Torus>
-uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
-    uint32_t polynomial_size);
-template <typename Torus>
 uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
    uint32_t polynomial_size);
 template <typename Torus>
@@ -98,12 +95,8 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(

 template <typename Torus, class params>
 uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                            uint32_t polynomial_size,
-                            uint64_t full_sm_keybundle);
-template <typename Torus, class params>
-uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
-                                uint32_t polynomial_size,
-                                uint64_t full_sm_keybundle);
+                            uint32_t polynomial_size);
+
 template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
@@ -122,7 +115,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
-             uint64_t &size_tracker) {
+             uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);

@@ -288,146 +281,4 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  }
 };

-template <typename InputTorus>
-struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
-  int8_t *d_mem_keybundle = NULL;
-  int8_t *d_mem_acc_step_one = NULL;
-  int8_t *d_mem_acc_step_two = NULL;
-  int8_t *d_mem_acc_cg = NULL;
-  int8_t *d_mem_acc_tbc = NULL;
-  uint32_t lwe_chunk_size;
-  double *keybundle_fft;
-  __uint128_t *global_accumulator;
-  double *global_join_buffer;
-
-  PBS_VARIANT pbs_variant;
-  bool gpu_memory_allocated;
-
-  pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
-                 uint32_t glwe_dimension, uint32_t polynomial_size,
-                 uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-                 uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
-                 bool allocate_gpu_memory, uint64_t &size_tracker) {
-    gpu_memory_allocated = allocate_gpu_memory;
-    cuda_set_device(gpu_index);
-
-    this->pbs_variant = pbs_variant;
-    this->lwe_chunk_size = lwe_chunk_size;
-    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-
-    // default
-    uint64_t full_sm_keybundle =
-        get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
-            __uint128_t>(polynomial_size);
-    uint64_t full_sm_accumulate_step_one =
-        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<
-            __uint128_t>(polynomial_size);
-    uint64_t full_sm_accumulate_step_two =
-        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<
-            __uint128_t>(polynomial_size);
-    uint64_t partial_sm_accumulate_step_one =
-        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
-            __uint128_t>(polynomial_size);
-    // cg
-    uint64_t full_sm_cg_accumulate =
-        get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>(
-            polynomial_size);
-    uint64_t partial_sm_cg_accumulate =
-        get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
-            __uint128_t>(polynomial_size);
-
-    auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
-                                (glwe_dimension + 1) * (glwe_dimension + 1) *
-                                level_count;
-    auto num_blocks_acc_step_one =
-        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
-    auto num_blocks_acc_step_two =
-        input_lwe_ciphertext_count * (glwe_dimension + 1);
-    auto num_blocks_acc_cg =
-        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
-
-    // Keybundle
-    if (max_shared_memory < full_sm_keybundle)
-      d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
-          num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
-          size_tracker, allocate_gpu_memory);
-
-    switch (pbs_variant) {
-    case PBS_VARIANT::CG:
-      // Accumulator CG
-      if (max_shared_memory < partial_sm_cg_accumulate)
-        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
-            size_tracker, allocate_gpu_memory);
-      else if (max_shared_memory < full_sm_cg_accumulate)
-        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
-            size_tracker, allocate_gpu_memory);
-      break;
-    case PBS_VARIANT::DEFAULT:
-      // Accumulator step one
-      if (max_shared_memory < partial_sm_accumulate_step_one)
-        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
-      else if (max_shared_memory < full_sm_accumulate_step_one)
-        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
-
-      // Accumulator step two
-      if (max_shared_memory < full_sm_accumulate_step_two)
-        d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
-      break;
-    default:
-      PANIC("Cuda error (PBS): unsupported implementation variant.")
-    }
-
-    keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
-        num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
-        stream, gpu_index, size_tracker, allocate_gpu_memory);
-    global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
-        input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
-            sizeof(__uint128_t),
-        stream, gpu_index, size_tracker, allocate_gpu_memory);
-    global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
-        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
-            (polynomial_size / 2) * 4 * sizeof(double),
-        stream, gpu_index, size_tracker, allocate_gpu_memory);
-  }
-
-  void release(cudaStream_t stream, uint32_t gpu_index) {
-
-    if (d_mem_keybundle)
-      cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
-                                         gpu_memory_allocated);
-    switch (pbs_variant) {
-    case DEFAULT:
-      if (d_mem_acc_step_one)
-        cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
-                                           gpu_index, gpu_memory_allocated);
-      if (d_mem_acc_step_two)
-        cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
-                                           gpu_index, gpu_memory_allocated);
-      break;
-    case CG:
-      if (d_mem_acc_cg)
-        cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
-                                           gpu_memory_allocated);
-      break;
-    default:
-      PANIC("Cuda error (PBS): unsupported implementation variant.")
-    }
-
-    cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
-                                       gpu_memory_allocated);
-    cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
-                                       gpu_memory_allocated);
-    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
-                                       gpu_memory_allocated);
-  }
-};
-
 #endif // CUDA_MULTI_BIT_UTILITIES_H
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -90,7 +90,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
             uint32_t glwe_dimension, uint32_t polynomial_size,
             uint32_t level_count, uint32_t input_lwe_ciphertext_count,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
-             bool allocate_ms_array, uint64_t &size_tracker) {
+             bool allocate_ms_array, uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);
    this->uses_noise_reduction = allocate_ms_array;
@@ -240,10 +240,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  }
 };

-template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;
-
-template <typename InputTorus>
-struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
+template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
  int8_t *d_mem;

  __uint128_t *global_accumulator;
@@ -260,7 +257,7 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
                 uint32_t polynomial_size, uint32_t level_count,
                 uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
                 bool allocate_gpu_memory, bool allocate_ms_array,
-                 uint64_t &size_tracker) {
+                 uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);
    this->pbs_variant = pbs_variant;
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -15,11 +15,6 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

-void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
-    void *stream, uint32_t gpu_index, void *dest, void const *src,
-    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size, uint32_t grouping_factor);
-
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -38,25 +33,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);
-
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride);
-
-void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
-                                                       const uint32_t gpu_index,
-                                                       int8_t **buffer);
 }

 #endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -10,8 +10,8 @@ extern "C" {
 void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
                        void *lwe_array_out, const void *lwe_compact_array_in,
                        uint32_t lwe_dimension, uint32_t num_lwe,
-                        const uint32_t *lwe_compact_input_indexes,
-                        const uint32_t *output_body_id_per_compact_list);
+                        const void *lwe_compact_input_indexes,
+                        const void *output_body_id_per_compact_list);

 uint64_t scratch_cuda_expand_without_verification_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -27,7 +27,7 @@ template <typename Torus> struct zk_expand_mem {
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
                const bool *is_boolean_array, uint32_t num_compact_lists,
-                bool allocate_gpu_memory, uint64_t &size_tracker)
+                bool allocate_gpu_memory, uint64_t *size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
        casting_key_type(casting_key_type) {
@@ -232,7 +232,7 @@ template <typename Torus> struct zk_expand_mem {
        num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
        allocate_gpu_memory);

-    message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes);
+    message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);

    // The expanded LWEs will always be on the casting key format
    tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
--- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,6 +1,5 @@
 file(GLOB_RECURSE SOURCES "*.cu")
-add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
-                                     pbs/programmable_bootstrap_multibit_128.cu)
+add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
 target_include_directories(tfhe_cuda_backend PRIVATE .)
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1,5 +1,6 @@
 #include "ciphertext.cuh"
 #include "polynomial/parameters.cuh"
+#include "torus.cuh"

 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
@@ -142,3 +143,29 @@ void cuda_glwe_sample_extract_128(
          "N's are powers of two in the interval [256..4096].")
  }
 }
+
+void cuda_modulus_switch_multi_bit_64(void *stream, uint32_t gpu_index,
+                                      void *lwe_array_out, void *lwe_array_in,
+                                      uint32_t size, uint32_t log_modulus,
+                                      uint32_t degree,
+                                      uint32_t grouping_factor) {
+
+  host_modulus_switch_multi_bit<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
+
+void cuda_modulus_switch_multi_bit_128(void *stream, uint32_t gpu_index,
+                                       void *lwe_array_out, void *lwe_array_in,
+                                       uint32_t size, uint32_t log_modulus,
+                                       uint32_t degree,
+                                       uint32_t grouping_factor) {
+
+  host_modulus_switch_multi_bit<__uint128_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<__uint128_t *>(lwe_array_out),
+      static_cast<__uint128_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -38,16 +38,6 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
 // Each thread in x are used to calculate one output.
 // threads in y are used to paralelize the lwe_dimension_in loop.
 // shared memory is used to store intermediate results of the reduction.
-// Note: To reduce register pressure we have slightly changed the algorithm,
-// the idea consists in calculating the negate value of the output. So, instead
-// of accumulating subtractions using -=, we accumulate additions using += in
-// the local_lwe_out. This seems to work better cause profits madd ops and save
-// some regs. For this to work, we need to negate the input
-// lwe_array_in[lwe_dimension_in], and negate back the output at the end to get
-// the correct results. Additionally, we split the calculation of the ksk offset
-// in two parts, a constant part is calculated before the loop, and a variable
-// part is calculated inside the loop. This seems to help with the register
-// pressure as well.
 template <typename Torus>
 __global__ void
 keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
@@ -70,7 +60,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
        lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);

    if (tid == lwe_dimension_out && threadIdx.y == 0) {
-      local_lwe_out = -block_lwe_array_in[lwe_dimension_in];
+      local_lwe_out = block_lwe_array_in[lwe_dimension_in];
    }
    const Torus mask_mod_b = (1ll << base_log) - 1ll;

@@ -83,12 +73,12 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
    for (int i = start_i; i < end_i; i++) {
      Torus state =
          init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
-      uint32_t offset = i * level_count * (lwe_dimension_out + 1);
-      for (int j = 0; j < level_count; j++) {

+      for (int j = 0; j < level_count; j++) {
+        auto ksk_block =
+            get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
        Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
-        local_lwe_out +=
-            (Torus)ksk[tid + j * (lwe_dimension_out + 1) + offset] * decomposed;
+        local_lwe_out -= (Torus)ksk_block[tid] * decomposed;
      }
    }

@@ -103,7 +93,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
          lwe_acc_out[shmem_index + offset * blockDim.x];
    }
    if (threadIdx.y == 0)
-      block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
+      block_lwe_array_out[tid] = lwe_acc_out[shmem_index];
  }
 }

@@ -125,7 +115,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(

  int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
  if (num_blocks_per_sample > 65536)
-    PANIC("Cuda error (Keyswitch): number of blocks per sample is too large");
+    PANIC("Cuda error (Keyswith): number of blocks per sample is too large");

  // In multiplication of large integers (512, 1024, 2048), the number of
  // samples can be larger than 65536, so we need to set it in the first
@@ -186,10 +176,10 @@ __host__ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe(
                             ? glwe_accumulator_size
                             : lwe_dimension * 2;

-  uint64_t size_tracker = 0;
+  uint64_t size_tracker;
  uint64_t buffer_size = 2 * num_lwes * memory_unit * sizeof(Torus);
  *fp_ks_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
-      buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
+      buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -66,13 +66,6 @@ __device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
  r = __ll2double_rn(x);
 }

-template <>
-__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
-                                                             double &r) {
-  // We truncate x
-  r = __ll2double_rn(static_cast<uint64_t>(x));
-}
-
 template <typename T>
 __device__ inline T init_decomposer_state(T input, uint32_t base_log,
                                          uint32_t level_count) {
@@ -302,4 +295,87 @@ __host__ void host_improve_noise_modulus_switch(
  check_cuda_error(cudaGetLastError());
 }

+template <typename Torus, class params>
+__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
+                                               uint32_t ggsw_idx,
+                                               uint32_t grouping_factor) {
+  Torus x = 0;
+  for (int i = 0; i < grouping_factor; i++) {
+    uint32_t mask_position = grouping_factor - (i + 1);
+    int selection_bit = (ggsw_idx >> mask_position) & 1;
+    x += selection_bit * lwe_array_group[i];
+  }
+
+  return modulus_switch(x, params::log2_degree + 1);
+}
+
+template <typename Torus, class params>
+__global__ void
+modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
+                         uint32_t log_modulus, uint32_t grouping_factor) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < size) {
+    int num_monomials = 1 << grouping_factor;
+    int input_offset = tid * grouping_factor;
+    int output_offset = tid * (num_monomials - 1); // First monomial is skipped
+    for (int ggsw_idx = 1; ggsw_idx < num_monomials; ggsw_idx++) {
+      array_out[ggsw_idx - 1 + output_offset] =
+          calculates_monomial_degree<Torus, params>(&array_in[input_offset],
+                                                    ggsw_idx, grouping_factor);
+    }
+  }
+}
+
+template <typename Torus>
+__host__ void host_modulus_switch_multi_bit(
+    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
+    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
+  cudaSetDevice(gpu_index);
+  int multibit_size = size / grouping_factor;
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
+  switch (degree) {
+  case 256:
+    modulus_switch_multi_bit<Torus, Degree<256>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 512:
+    modulus_switch_multi_bit<Torus, Degree<512>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 1024:
+    modulus_switch_multi_bit<Torus, Degree<1024>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 2048:
+    modulus_switch_multi_bit<Torus, Degree<2048>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 4096:
+    modulus_switch_multi_bit<Torus, Degree<4096>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 8192:
+    modulus_switch_multi_bit<Torus, Degree<8192>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 16384:
+    modulus_switch_multi_bit<Torus, Degree<16384>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [256..16384].")
+  };
+
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -2,12 +2,6 @@
 #include <cstdint>
 #include <cuda_runtime.h>

-uint32_t cuda_get_device() {
-  int device;
-  check_cuda_error(cudaGetDevice(&device));
-  return static_cast<uint32_t>(device);
-}
-
 void cuda_set_device(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
 }
@@ -80,9 +74,10 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 /// asynchronously.
 void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
+                                           uint64_t *size_tracker,
                                           bool allocate_gpu_memory) {
-  size_tracker += size;
+  if (size_tracker != nullptr)
+    *size_tracker += size;
  void *ptr = nullptr;
  if (!allocate_gpu_memory)
    return ptr;
@@ -111,9 +106,8 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
 /// asynchronously.
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                        uint32_t gpu_index) {
-  uint64_t size_tracker = 0;
-  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
-                                              size_tracker, true);
+  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index, nullptr,
+                                              true);
 }

 /// Check that allocation is valid
@@ -128,13 +122,6 @@ bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
  }
 }

-uint64_t cuda_device_total_memory(uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
-  size_t total_mem = 0, free_mem = 0;
-  check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
-  return total_mem;
-}
-
 /// Returns
 ///  false if Cooperative Groups is not supported.
 ///  true otherwise
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -234,29 +234,6 @@ __device__ void convert_u128_to_f128_as_torus(
  }
 }

-// params is expected to be full degree not half degree
-// same as convert_u128_to_f128_as_torus() but expects input to be on registers
-template <class params>
-__device__ void convert_u128_on_regs_to_f128_as_torus(
-    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
-    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
-
-  const double normalization = pow(2., -128.);
-  Index tid = threadIdx.x;
-  // #pragma unroll
-  for (Index i = 0; i < params::opt / 2; i++) {
-    auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]);
-    auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]);
-
-    out_re_hi[tid] = out_re.hi * normalization;
-    out_re_lo[tid] = out_re.lo * normalization;
-    out_im_hi[tid] = out_im.hi * normalization;
-    out_im_lo[tid] = out_im.lo * normalization;
-
-    tid += params::degree / params::opt;
-  }
-}
-
 template <class params>
 __device__ void
 convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im,
@@ -295,7 +272,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
 }

 // params is expected to be full degree not half degree
-// converts standard input into complex<128> represented by 4 double
+// converts standqard input into complex<128> represented by 4 double
 // with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
 // ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n,  im_lo_0, im_lo_1, ..., im_lo_n]
 template <class params>
@@ -314,7 +291,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
 }

 // params is expected to be full degree not half degree
-// converts standard input into complex<128> represented by 4 double
+// converts standqard input into complex<128> represented by 4 double
 // with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1,
 // re_lo_1, im_hi_1, im_lo_1,
 // ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ]
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -26,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
  if (is_signed) {
    *mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count,
                                         params, num_blocks,
-                                         allocate_gpu_memory, size_tracker);
+                                         allocate_gpu_memory, &size_tracker);
  }
  return size_tracker;
 }
@@ -53,8 +53,7 @@ __host__ void host_integer_abs_kb(
      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
-                       ct->num_radix_blocks, mem_ptr->params.message_modulus,
-                       mem_ptr->params.carry_modulus);
+                       ct->num_radix_blocks);

  uint32_t requested_flag = outputFlag::FLAG_NONE;
  uint32_t uses_carry = 0;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -61,7 +61,7 @@ __host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op,
                                         params, num_radix_blocks,
-                                         allocate_gpu_memory, size_tracker);
+                                         allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
@@ -15,48 +15,3 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
  host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
                                       gpu_indexes);
 }
-
-uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t num_additional_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_extend_radix_with_sign_msb<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count,
-      (int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr, params,
-      num_blocks, num_additional_blocks, allocate_gpu_memory);
-}
-
-void cuda_extend_radix_with_sign_msb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
-    int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
-    void *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
-  host_extend_radix_with_sign_msb<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
-      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
-      num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
-}
-
-void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
-                                                uint32_t const *gpu_indexes,
-                                                uint32_t gpu_count,
-                                                int8_t **mem_ptr_void) {
-
-  int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
-      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
@@ -33,62 +33,4 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
      input_start_lwe_index, input->num_radix_blocks);
 }

-template <typename Torus>
-__host__ uint64_t scratch_extend_radix_with_sign_msb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
-    const int_radix_params params, uint32_t num_radix_blocks,
-    uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-
-  *mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      num_additional_blocks, allocate_gpu_memory, size_tracker);
-
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_extend_radix_with_sign_msb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
-    CudaRadixCiphertextFFI const *input,
-    int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
-    uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
-  if (num_additional_blocks == 0) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
-                                       input);
-    return;
-  }
-
-  const uint32_t input_blocks = input->num_radix_blocks;
-
-  if (input_blocks == 0) {
-    PANIC("Cuda error: input blocks cannot be zero");
-  }
-
-  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
-                                           0, input_blocks, input, 0,
-                                           input_blocks);
-
-  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                           mem_ptr->last_block, 0, 1, input,
-                                           input_blocks - 1, input_blocks);
-
-  host_apply_univariate_lut_kb(
-      streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
-      mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
-
-  for (uint32_t i = 0; i < num_additional_blocks; ++i) {
-    uint32_t dst_block_idx = input_blocks + i;
-
-    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
-                                             dst_block_idx, dst_block_idx + 1,
-                                             mem_ptr->padding_block, 0, 1);
-  }
-}
-
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -17,12 +17,11 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
  std::function<uint64_t(uint64_t)> predicate_lut_f =
      [](uint64_t x) -> uint64_t { return x == 1; };

-  uint64_t ret = scratch_cuda_integer_radix_cmux_kb<uint64_t>(
+  return scratch_cuda_integer_radix_cmux_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
  POP_RANGE()
-  return ret;
 }

 void cuda_cmux_integer_radix_ciphertext_kb_64(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -84,8 +84,7 @@ __host__ void host_integer_radix_cmux_kb(
                                   num_radix_blocks, 2 * num_radix_blocks);

  host_addition<Torus>(streams[0], gpu_indexes[0], &mem_true, &mem_true,
-                       &mem_false, num_radix_blocks, params.message_modulus,
-                       params.carry_modulus);
+                       &mem_false, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
@@ -101,7 +100,7 @@ __host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_cmux_buffer<Torus>(
      streams, gpu_indexes, gpu_count, predicate_lut_f, params,
-      num_radix_blocks, allocate_gpu_memory, size_tracker);
+      num_radix_blocks, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -148,7 +148,7 @@ __host__ void are_all_comparisons_block_true(
        cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
                                 h_lut_indexes, num_chunks * sizeof(Torus),
                                 streams[0], gpu_indexes[0]);
-        is_max_value_lut->broadcast_lut(streams, gpu_indexes);
+        is_max_value_lut->broadcast_lut(streams, gpu_indexes, 0);
      }
      lut = is_max_value_lut;
    }
@@ -167,7 +167,7 @@ __host__ void are_all_comparisons_block_true(
                               is_max_value_lut->h_lut_indexes,
                               is_max_value_lut->num_blocks * sizeof(Torus),
                               streams[0], gpu_indexes[0]);
-      is_max_value_lut->broadcast_lut(streams, gpu_indexes);
+      is_max_value_lut->broadcast_lut(streams, gpu_indexes, 0);
      reset_radix_ciphertext_blocks(lwe_array_out, 1);
      return;
    } else {
@@ -499,7 +499,7 @@ __host__ void tree_sign_reduction(
      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
      last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
      polynomial_size, message_modulus, carry_modulus, f, true);
-  last_lut->broadcast_lut(streams, gpu_indexes);
+  last_lut->broadcast_lut(streams, gpu_indexes, 0);

  // Last leaf
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
@@ -684,7 +684,7 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_comparison_buffer<Torus>(
      streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, is_signed,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -89,3 +89,16 @@ void cleanup_cuda_integer_decompress_radix_ciphertext_64(
      (int_decompression<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
+
+void cuda_integer_extract_glwe_64(
+    void *const *streams, uint32_t const *gpu_indexes, void *lwe_array_out,
+    void const *glwe_list, uint32_t const glwe_index,
+    uint32_t const log_modulus, uint32_t const polynomial_size,
+    uint32_t const glwe_dimension, uint32_t const body_count) {
+  auto casted_streams = (cudaStream_t *)(streams);
+
+  host_extract_mem_alloc_free<uint64_t>(
+      casted_streams[0], gpu_indexes[0], static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(glwe_list), glwe_index, log_modulus,
+      polynomial_size, glwe_dimension, body_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -231,6 +231,84 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
  check_cuda_error(cudaGetLastError());
 }

+/// Extracts the glwe_index-nth GLWE ciphertext
+/// This function does not require memory allocation
+template <typename Torus>
+__host__ void host_extract_mem_alloc_free(
+    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_array_out,
+    Torus const *array_in, const uint32_t glwe_index,
+    const uint32_t log_modulus, const uint32_t polynomial_size,
+    const uint32_t glwe_dimension, const uint32_t body_count_in) {
+  if (array_in == glwe_array_out)
+    PANIC("Cuda error: Input and output must be different");
+
+  cuda_set_device(gpu_index);
+
+  auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
+  auto num_glwes = (body_count_in + polynomial_size - 1) / polynomial_size;
+
+  // Compressed length of the compressed GLWE we want to extract
+  uint32_t body_count = 0;
+  if (body_count_in % polynomial_size == 0)
+    body_count = polynomial_size;
+  else if (glwe_index == num_glwes - 1)
+    body_count = body_count_in % polynomial_size;
+  else
+    body_count = polynomial_size;
+
+  auto initial_out_len = glwe_dimension * polynomial_size + body_count;
+  // Calculates how many bits this particular GLWE shall use
+  auto number_bits_to_unpack = initial_out_len * log_modulus;
+  auto nbits = sizeof(Torus) * 8;
+  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+
+  // Calculates how many bits a full-packed GLWE shall use
+  number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
+  auto len = (number_bits_to_unpack + nbits - 1) / nbits;
+  // Uses that length to set the input pointer
+  auto chunk_array_in = array_in + glwe_index * len;
+  // Ensure the tail of the GLWE is zeroed
+  if (initial_out_len < glwe_ciphertext_size) {
+    auto zeroed_slice = glwe_array_out + initial_out_len;
+    cuda_memset_async(glwe_array_out, 0,
+                      (glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
+                      stream, gpu_index);
+  }
+
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  dim3 grid(num_blocks);
+  dim3 threads(num_threads);
+  extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, chunk_array_in,
+                                               log_modulus, initial_out_len);
+
+  // uint32_t body_count = std::min(body_count_in, polynomial_size);
+  // auto initial_out_len = glwe_dimension * polynomial_size + body_count;
+
+  // auto compressed_glwe_accumulator_size =
+  //     (glwe_dimension + 1) * polynomial_size;
+
+  // auto number_bits_to_unpack = compressed_glwe_accumulator_size *
+  // log_modulus; auto nbits = sizeof(Torus) * 8;
+  // // number_bits_to_unpack.div_ceil(Scalar::BITS)
+  // auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+
+  // // We assure the tail of the glwe is zeroed
+  // auto zeroed_slice = glwe_array_out + initial_out_len;
+  // cuda_memset_async(zeroed_slice, 0,
+  //                   (polynomial_size - body_count) * sizeof(Torus),
+  //                   streams[0], gpu_indexes[0]);
+  // int num_blocks = 0, num_threads = 0;
+  // getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  // dim3 grid(num_blocks);
+  // dim3 threads(num_threads);
+  // extract<Torus><<<grid, threads, 0, streams[0]>>>(glwe_array_out, array_in,
+  //                                                  glwe_index, log_modulus,
+  //                                                  input_len,
+  //                                                  initial_out_len);
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename Torus>
 __host__ void host_integer_decompress(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -370,7 +448,7 @@ __host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_compression<Torus>(
      streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
-      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, size_tracker);
+      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -386,7 +464,7 @@ __host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
  *mem_ptr = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
      num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory,
-      size_tracker);
+      &size_tracker);
  return size_tracker;
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -28,7 +28,7 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count,
                                           params, is_signed, num_blocks,
-                                           allocate_gpu_memory, size_tracker);
+                                           allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -268,11 +268,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
    // but in that position, interesting_remainder2 always has a 0
    auto merged_interesting_remainder = interesting_remainder1;

-    host_addition<Torus>(
-        streams[0], gpu_indexes[0], merged_interesting_remainder,
-        merged_interesting_remainder, interesting_remainder2,
-        merged_interesting_remainder->num_radix_blocks,
-        radix_params.message_modulus, radix_params.carry_modulus);
+    host_addition<Torus>(streams[0], gpu_indexes[0],
+                         merged_interesting_remainder,
+                         merged_interesting_remainder, interesting_remainder2,
+                         merged_interesting_remainder->num_radix_blocks);

    // after create_clean_version_of_merged_remainder
    // `merged_interesting_remainder` will be reused as
@@ -383,10 +382,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

-    host_addition<Torus>(
-        streams[0], gpu_indexes[0], overflow_sum, subtraction_overflowed,
-        at_least_one_upper_block_is_non_zero, 1, radix_params.message_modulus,
-        radix_params.carry_modulus);
+    host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum,
+                         subtraction_overflowed,
+                         at_least_one_upper_block_is_non_zero, 1);

    auto message_modulus = radix_params.message_modulus;
    int factor = (i) ? message_modulus - 1 : message_modulus - 2;
@@ -436,9 +434,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      as_radix_ciphertext_slice<Torus>(&quotient_block, quotient, block_of_bit,
                                       block_of_bit + 1);
      host_addition<Torus>(streams[0], gpu_indexes[0], &quotient_block,
-                           &quotient_block, mem_ptr->did_not_overflow, 1,
-                           radix_params.message_modulus,
-                           radix_params.carry_modulus);
+                           &quotient_block, mem_ptr->did_not_overflow, 1);
    };

    for (uint j = 0; j < gpu_count; j++) {
@@ -481,9 +477,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
  // Clean the quotient and remainder
  // as even though they have no carries, they are not at nominal noise level
  host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1,
-                       remainder2, remainder1->num_radix_blocks,
-                       radix_params.message_modulus,
-                       radix_params.carry_modulus);
+                       remainder2, remainder1->num_radix_blocks);

  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -398,7 +398,7 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
  *mem_ptr = new int_noise_squashing_lut<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
      polynomial_size, num_radix_blocks, original_num_blocks,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -409,8 +409,7 @@ __host__ void host_pack_bivariate_blocks(
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    Torus const *lwe_indexes_out, CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, Torus const *lwe_indexes_in,
-    uint32_t shift, uint32_t num_radix_blocks, uint32_t const message_modulus,
-    uint32_t const carry_modulus) {
+    uint32_t shift, uint32_t num_radix_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
      lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -434,15 +433,6 @@ __host__ void host_pack_bivariate_blocks(
          (Torus *)lwe_array_1->ptr, (Torus *)lwe_array_2->ptr, lwe_indexes_in,
          lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
-
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    lwe_array_out->degrees[i] =
-        lwe_array_1->degrees[i] * shift + lwe_array_2->degrees[i];
-    lwe_array_out->noise_levels[i] =
-        lwe_array_1->noise_levels[i] * shift + lwe_array_2->noise_levels[i];
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], message_modulus,
-                      carry_modulus);
-  }
 }

 // polynomial_size threads
@@ -531,7 +521,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  if (num_radix_blocks > lut->num_blocks)
    PANIC("Cuda error: num radix blocks on which lut is applied should be "
          "smaller or equal to the number of lut radix blocks")
-  if (num_radix_blocks > lwe_array_out->num_radix_blocks)
+  if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
+      num_radix_blocks > lwe_array_in->num_radix_blocks)
    PANIC("Cuda error: num radix blocks on which lut is applied should be "
          "smaller or equal to the number of input & output radix blocks")

@@ -607,8 +598,6 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
    auto degrees_index = lut->h_lut_indexes[i];
    lwe_array_out->degrees[i] = lut->degrees[degrees_index];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
  }
  POP_RANGE()
 }
@@ -714,8 +703,6 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
    auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
    lwe_array_out->degrees[i] = lut->degrees[degrees_index];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
  }
  POP_RANGE()
 }
@@ -764,7 +751,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
  host_pack_bivariate_blocks<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
      lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
-      shift, num_radix_blocks, params.message_modulus, params.carry_modulus);
+      shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  /// For multi GPU execution we create vectors of pointers for inputs and
@@ -832,8 +819,6 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
    auto degrees_index = lut->h_lut_indexes[i];
    lwe_array_out->degrees[i] = lut->degrees[degrees_index];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
  }
  POP_RANGE()
 }
@@ -1462,8 +1447,6 @@ void host_full_propagate_inplace(
    auto degrees_index = mem_ptr->lut->h_lut_indexes[0];
    input_blocks->degrees[i] = mem_ptr->lut->degrees[degrees_index];
    input_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(input_blocks->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);

    if (i < num_blocks - 1) {
      CudaRadixCiphertextFFI next_input_block;
@@ -1474,8 +1457,7 @@ void host_full_propagate_inplace(
                                       mem_ptr->tmp_big_lwe_vector, 1, 2);

      host_addition<Torus>(streams[0], gpu_indexes[0], &next_input_block,
-                           &next_input_block, &second_input, 1,
-                           params.message_modulus, params.carry_modulus);
+                           &next_input_block, &second_input, 1);
    }
  }
 }
@@ -1491,7 +1473,7 @@ uint64_t scratch_cuda_full_propagation(cudaStream_t const *streams,
  uint64_t size_tracker = 0;
  *mem_ptr =
      new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
-                                     allocate_gpu_memory, size_tracker);
+                                     allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -1657,7 +1639,7 @@ __host__ void reduce_signs(
        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
        lut->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, reduce_two_orderings_function, true);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    while (num_sign_blocks > 2) {
      pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1688,7 +1670,7 @@ __host__ void reduce_signs(
        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
        lut->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, final_lut_f, true);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
                       num_sign_blocks, message_modulus);
@@ -1708,7 +1690,7 @@ __host__ void reduce_signs(
        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
        lut->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, final_lut_f, true);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
@@ -1726,7 +1708,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, allocate_gpu_memory,
-                                      size_tracker);
+                                      &size_tracker);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
  cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1734,7 +1716,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
      streams[0], gpu_indexes[0], allocate_gpu_memory);
  *(*mem_ptr)->get_degree(0) = lut_degree;
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
  return size_tracker;
 }

@@ -1762,7 +1744,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, num_many_lut,
-                                      allocate_gpu_memory, size_tracker);
+                                      allocate_gpu_memory, &size_tracker);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
  cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1770,7 +1752,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
      streams[0], gpu_indexes[0], allocate_gpu_memory);
  *(*mem_ptr)->get_degree(0) = lut_degree;
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
  return size_tracker;
 }

@@ -1798,7 +1780,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, allocate_gpu_memory,
-                                      size_tracker);
+                                      &size_tracker);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
  cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1806,7 +1788,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
      streams[0], gpu_indexes[0], allocate_gpu_memory);
  *(*mem_ptr)->get_degree(0) = lut_degree;
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
  return size_tracker;
 }

@@ -1836,7 +1818,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_sc_prop_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
-      uses_carry, allocate_gpu_memory, size_tracker);
+      uses_carry, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }
 // This function perform the three steps of Thomas' new carry propagation
@@ -1853,6 +1835,9 @@ void host_propagate_single_carry(
  PUSH_RANGE("propagate sc")
  auto num_radix_blocks = lwe_array->num_radix_blocks;
  auto params = mem->params;
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
+  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto lut_stride = mem->lut_stride;
  auto num_many_lut = mem->num_many_lut;
  CudaRadixCiphertextFFI output_flag;
@@ -1866,10 +1851,8 @@ void host_propagate_single_carry(
          "pointer")
  if (uses_carry == 1) {
    host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
-                         input_carries, 1, params.message_modulus,
-                         params.carry_modulus);
+                         input_carries, 1);
  }
-
  // Step 1
  host_compute_shifted_blocks_and_states<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array, mem->shifted_blocks_state_mem,
@@ -1893,8 +1876,7 @@ void host_propagate_single_carry(
  auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
  host_addition<Torus>(
      streams[0], gpu_indexes[0], prepared_blocks, shifted_blocks,
-      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks,
-      params.message_modulus, params.carry_modulus);
+      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks);

  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
@@ -1903,8 +1885,7 @@ void host_propagate_single_carry(
        &shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
        num_radix_blocks - 1, num_radix_blocks);
    host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
-                         &shifted_simulators, 1, params.message_modulus,
-                         params.carry_modulus);
+                         &shifted_simulators, 1);
  }

  host_radix_sum_in_groups<Torus>(
@@ -1918,8 +1899,7 @@ void host_propagate_single_carry(
        mem->prop_simu_group_carries_mem->resolved_carries, mem->num_groups - 1,
        mem->num_groups);
    host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
-                         &shifted_resolved_carries, 1, params.message_modulus,
-                         params.carry_modulus);
+                         &shifted_resolved_carries, 1);

    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], prepared_blocks, num_radix_blocks,
@@ -1958,26 +1938,15 @@ void host_add_and_propagate_single_carry(
  PUSH_RANGE("add & propagate sc")
  if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
-
-  // Check input carries if used
-  if (uses_carry == 1) {
-    if (input_carries == nullptr)
-      PANIC("Cuda error: if uses_carry is enabled, input_carries cannot be a "
-            "null pointer");
-    if (lhs_array->lwe_dimension != input_carries->lwe_dimension)
-      PANIC(
-          "Cuda error: input and input_carries lwe dimension must be the same");
-  }
-
-  // Allow nullptr for carry_out if FLAG_NONE is requested
+  if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
+      lhs_array->lwe_dimension != input_carries->lwe_dimension ||
+      lhs_array->lwe_dimension != carry_out->lwe_dimension)
+    PANIC("Cuda error: input and output lwe dimension must be the same")
  if ((requested_flag == outputFlag::FLAG_OVERFLOW ||
-       requested_flag == outputFlag::FLAG_CARRY)) {
-    if (carry_out == nullptr)
-      PANIC("Cuda error: when requesting FLAG_CARRY or FLAG_OVERFLOW, "
-            "carry_out must be a valid pointer")
-    if (lhs_array->lwe_dimension != carry_out->lwe_dimension)
-      PANIC("Cuda error: input and carry_out lwe dimension must be the same")
-  }
+       requested_flag == outputFlag::FLAG_CARRY) &&
+      carry_out == nullptr)
+    PANIC("Cuda error: when requesting FLAG_CARRY, carry_out must be a valid "
+          "pointer")

  auto num_radix_blocks = lhs_array->num_radix_blocks;
  auto params = mem->params;
@@ -2000,13 +1969,11 @@ void host_add_and_propagate_single_carry(
  }

  host_addition<Torus>(streams[0], gpu_indexes[0], lhs_array, lhs_array,
-                       rhs_array, num_radix_blocks, params.message_modulus,
-                       params.carry_modulus);
+                       rhs_array, num_radix_blocks);

  if (uses_carry == 1) {
    host_addition<Torus>(streams[0], gpu_indexes[0], lhs_array, lhs_array,
-                         input_carries, 1, params.message_modulus,
-                         params.carry_modulus);
+                         input_carries, 1);
  }
  // Step 1
  host_compute_shifted_blocks_and_states<Torus>(
@@ -2037,8 +2004,7 @@ void host_add_and_propagate_single_carry(
  auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
  host_addition<Torus>(
      streams[0], gpu_indexes[0], prepared_blocks, shifted_blocks,
-      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks,
-      params.message_modulus, params.carry_modulus);
+      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks);

  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
@@ -2047,8 +2013,7 @@ void host_add_and_propagate_single_carry(
        &shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
        num_radix_blocks - 1, num_radix_blocks);
    host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
-                         &shifted_simulators, 1, params.message_modulus,
-                         params.carry_modulus);
+                         &shifted_simulators, 1);
  }

  // Step 3
@@ -2063,8 +2028,7 @@ void host_add_and_propagate_single_carry(
    if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW &&
        uses_carry == 1) {
      host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag,
-                           &output_flag, input_carries, 1,
-                           params.message_modulus, params.carry_modulus);
+                           &output_flag, input_carries, 1);

    } else {
      CudaRadixCiphertextFFI shifted_resolved_carries;
@@ -2073,8 +2037,7 @@ void host_add_and_propagate_single_carry(
          mem->prop_simu_group_carries_mem->resolved_carries,
          mem->num_groups - 1, mem->num_groups);
      host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag,
-                           &output_flag, &shifted_resolved_carries, 1,
-                           params.message_modulus, params.carry_modulus);
+                           &output_flag, &shifted_resolved_carries, 1);
    }
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], prepared_blocks, num_radix_blocks,
@@ -2087,7 +2050,6 @@ void host_add_and_propagate_single_carry(
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], lhs_array, 0, num_radix_blocks,
        mem->output_flag, 0, num_radix_blocks);
-
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], carry_out, 0, 1, mem->output_flag,
        num_radix_blocks, num_radix_blocks + 1);
@@ -2109,7 +2071,7 @@ uint64_t scratch_cuda_integer_overflowing_sub(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_borrow_prop_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      compute_overflow, allocate_gpu_memory, size_tracker);
+      compute_overflow, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -2179,8 +2141,7 @@ void host_single_borrow_propagate(
        &shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
        num_radix_blocks - 1, num_radix_blocks);
    host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
-                         mem->overflow_block, &shifted_simulators, 1,
-                         params.message_modulus, params.carry_modulus);
+                         mem->overflow_block, &shifted_simulators, 1);
  }
  CudaRadixCiphertextFFI resolved_borrows;
  as_radix_ciphertext_slice<Torus>(
@@ -2192,8 +2153,7 @@ void host_single_borrow_propagate(
  //  borrows
  if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
    host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
-                         mem->overflow_block, &resolved_borrows, 1,
-                         params.message_modulus, params.carry_modulus);
+                         mem->overflow_block, &resolved_borrows, 1);
  }

  cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]);
@@ -2343,8 +2303,6 @@ __host__ void integer_radix_apply_noise_squashing_kb(
  for (uint i = 0; i < lut->num_blocks; i++) {
    lwe_array_out->degrees[i] = lut->degrees[0];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
  }
  POP_RANGE()
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -210,8 +210,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
-    bool allocate_ms_array) {
+    bool allocate_gpu_memory, bool allocate_ms_array) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -221,8 +220,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
  return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
-      max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
-      allocate_gpu_memory);
+      max_num_radix_in_vec, params, allocate_gpu_memory);
 }

 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
@@ -236,11 +234,69 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
  if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
    PANIC("Cuda error: input vector length should be a multiple of the "
          "output's number of radix blocks")
-  host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
-      radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
-      radix_lwe_out->num_radix_blocks,
-      radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
+  // FIXME: this should not be necessary, we should make sure sum_ctxt works in
+  // the general case
+  for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
+    radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
+  }
+  switch (mem->params.polynomial_size) {
+  case 512:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 1024:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<1024>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 2048:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<2048>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 4096:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<4096>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 8192:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<8192>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 16384:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<16384>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  default:
+    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
+          "Supported N's are powers of two in the interval [256..16384].")
+  }
 }

 void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -24,6 +24,24 @@
 #include <string>
 #include <vector>

+template <typename Torus>
+__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
+                           int32_t *id_in, size_t lwe_size) {
+  size_t tid = threadIdx.x;
+  size_t b_id = blockIdx.x;
+  size_t stride = blockDim.x;
+
+  auto input_id = id_in[b_id];
+  auto output_id = id_out[b_id];
+
+  auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
+  auto cur_dst = &dst[output_id * lwe_size];
+
+  for (int i = tid; i < lwe_size; i += stride) {
+    cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
+  }
+}
+
 template <typename Torus, class params>
 __global__ void
 all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
@@ -76,129 +94,35 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
  }
 }

-__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
-                                            uint32_t *const columns_counter,
-                                            const uint64_t *const degrees,
-                                            const uint32_t num_radix_blocks,
-                                            const uint32_t num_radix_in_vec) {
-
-  const uint32_t idx = threadIdx.x;
-  size_t cnt = 0;
-  for (int i = 0; i < num_radix_in_vec; i++) {
-    size_t ct_id = i * num_radix_blocks + idx;
-    if (degrees[ct_id] != 0) {
-      columns[idx][cnt] = ct_id;
-      ++cnt;
-    }
-  }
-  columns_counter[idx] = cnt;
-}
-
-__global__ inline void prepare_new_columns(
-    uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
-    const uint32_t *const *const columns, const uint32_t *const columns_counter,
-    const uint32_t chunk_size) {
-
-  const uint32_t base_id = threadIdx.x;
-  const uint32_t column_len = columns_counter[base_id];
-
-  uint32_t ct_count = 0;
-  for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
-    // those indexes are for message ciphertexts
-    // for message ciphertexts in and out index should be same
-    const uint32_t in_index = columns[base_id][i];
-    new_columns[base_id][ct_count] = in_index;
-    ++ct_count;
-  }
-
-  if (base_id > 0) {
-    const uint32_t prev_base_id = base_id - 1;
-    const uint32_t prev_column_len = columns_counter[prev_base_id];
-
-    for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
-      // those indexes are for carry ciphertexts
-      // for carry ciphertexts input is same as for message
-      // output will be placed to next block in the column
-      const uint32_t out_index = columns[prev_base_id][i + 1];
-      new_columns[base_id][ct_count] = out_index;
-      ++ct_count;
-    }
-  }
-
-  const uint32_t start_index = column_len - column_len % chunk_size;
-  for (uint32_t i = start_index; i < column_len; ++i) {
-    new_columns[base_id][ct_count] = columns[base_id][i];
-    ++ct_count;
-  }
-
-  new_columns_counter[base_id] = ct_count;
-}
-
 template <typename Torus>
-__global__ void calculate_chunks(Torus *const input_blocks,
-                                 const uint32_t *const *const columns,
-                                 const uint32_t *const columns_counter,
-                                 const uint32_t chunk_size,
-                                 const uint32_t block_size) {
+__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
+                                uint32_t chunk_size, uint32_t block_size,
+                                uint32_t num_blocks) {

-  const uint32_t part_size = blockDim.x;
-  const uint32_t base_id = blockIdx.x;
-  const uint32_t part_id = blockIdx.y;
-  const uint32_t coef_id = part_id * part_size + threadIdx.x;
+  size_t stride = blockDim.x;
+  size_t chunk_id = blockIdx.x;
+  size_t chunk_elem_size = chunk_size * num_blocks * block_size;
+  size_t radix_elem_size = num_blocks * block_size;
+  auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
+  auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
+  size_t block_stride = blockIdx.y * block_size;
+  auto result = &dst_radix[block_stride];

-  if (coef_id >= block_size)
-    return;
+  // init shared mem with first radix of chunk
+  size_t tid = threadIdx.x;
+  for (int i = tid; i < block_size; i += stride) {
+    result[i] = src_chunk[block_stride + i];
+  }

-  const uint32_t column_len = columns_counter[base_id];
-
-  if (column_len >= chunk_size) {
-    const uint32_t num_chunks = column_len / chunk_size;
-    Torus result = 0;
-
-    for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
-      const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
-      result = input_blocks[first_ct_id * block_size + coef_id];
-
-      for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
-        const uint32_t cur_ct_id =
-            columns[base_id][chunk_id * chunk_size + ct_id];
-        result += input_blocks[cur_ct_id * block_size + coef_id];
-      }
-
-      input_blocks[first_ct_id * block_size + coef_id] = result;
+  // accumulate rest  of the radixes
+  for (int r_id = 1; r_id < chunk_size; r_id++) {
+    auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
+    for (int i = tid; i < block_size; i += stride) {
+      result[i] += cur_src_radix[block_stride + i];
    }
  }
 }

-template <typename Torus>
-__global__ void calculate_final_chunk_into_radix(
-    Torus *const out_radix, const Torus *const input_blocks,
-    const uint32_t *const *const columns, const uint32_t *const columns_counter,
-    const uint32_t chunk_size, const uint32_t block_size) {
-
-  const uint32_t part_size = blockDim.x;
-  const uint32_t base_id = blockIdx.x;
-  const uint32_t part_id = blockIdx.y;
-  const uint32_t coef_id = part_id * part_size + threadIdx.x;
-
-  if (coef_id >= block_size)
-    return;
-
-  const uint32_t column_len = columns_counter[base_id];
-
-  Torus result = 0;
-  if (column_len) {
-    const uint32_t first_ct_id = columns[base_id][0];
-    result = input_blocks[first_ct_id * block_size + coef_id];
-
-    for (uint32_t i = 1; i < column_len; ++i) {
-      const uint32_t cur_ct_it = columns[base_id][i];
-      result += input_blocks[cur_ct_it * block_size + coef_id];
-    }
-  }
-  out_radix[base_id * block_size + coef_id] = result;
-}
-
 template <typename Torus, class params>
 __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
                                        Torus *msb_blocks,
@@ -243,33 +167,29 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
        (process_msb) ? cur_msb_ct[params::degree] : 0;
  }
 }
-
 template <typename Torus>
 __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
-    bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
-    bool allocate_gpu_memory) {
+    int_radix_params params, bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
-      max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation,
-      allocate_gpu_memory, size_tracker);
+      max_num_radix_in_vec, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

-template <typename Torus>
+template <typename Torus, class params>
 __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
-    uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
-  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
-  auto big_lwe_size = big_lwe_dimension + 1;
+    uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
+    int_radix_lut<Torus> *reused_lut) {

  if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
    PANIC("Cuda error: output and input radix ciphertexts should have the same "
@@ -279,201 +199,273 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    PANIC("Cuda error: input vector does not have enough blocks")
  if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
    PANIC("Cuda error: output does not have enough blocks")
-  if (num_radix_in_vec == 0)
-    return;
-
-  auto current_blocks = mem_ptr->current_blocks;
+  auto new_blocks = mem_ptr->new_blocks;
+  auto new_blocks_copy = mem_ptr->new_blocks_copy;
+  auto old_blocks = mem_ptr->old_blocks;
  auto small_lwe_vector = mem_ptr->small_lwe_vector;
-  auto d_degrees = mem_ptr->d_degrees;
-  auto d_columns = mem_ptr->d_columns;
-  auto d_columns_counter = mem_ptr->d_columns_counter;
-  auto d_new_columns = mem_ptr->d_new_columns;
-  auto d_new_columns_counter = mem_ptr->d_new_columns_counter;

+  auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
+  auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
+
+  auto message_modulus = mem_ptr->params.message_modulus;
+  auto carry_modulus = mem_ptr->params.carry_modulus;
+  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
+  auto big_lwe_size = big_lwe_dimension + 1;
  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
  auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
-  auto chunk_size =
-      (mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
-      (mem_ptr->params.message_modulus - 1);
-
-  size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
+  auto small_lwe_size = small_lwe_dimension + 1;

  // In the case of extracting a single LWE this parameters are dummy
  uint32_t num_many_lut = 1;
  uint32_t lut_stride = 0;

-  if (terms->num_radix_blocks == 0) {
+  if (terms->num_radix_blocks == 0)
    return;
-  }
  if (num_radix_in_vec == 1) {
    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
                                             radix_lwe_out, 0, num_radix_blocks,
                                             terms, 0, num_radix_blocks);
    return;
  }
-
+  if (old_blocks != terms) {
+    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
+                                       terms);
+  }
  if (num_radix_in_vec == 2) {
-    CudaRadixCiphertextFFI terms_slice;
-    as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
-                                     2 * num_radix_blocks);
-    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
-                         &terms_slice, num_radix_blocks,
-                         mem_ptr->params.message_modulus,
-                         mem_ptr->params.carry_modulus);
+    CudaRadixCiphertextFFI old_blocks_slice;
+    as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
+                                     num_radix_blocks, 2 * num_radix_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                         &old_blocks_slice, num_radix_blocks);
    return;
  }

-  if (current_blocks != terms) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                       current_blocks, terms);
+  size_t r = num_radix_in_vec;
+  size_t total_modulus = message_modulus * carry_modulus;
+  size_t message_max = message_modulus - 1;
+  size_t chunk_size = (total_modulus - 1) / message_max;
+
+  size_t h_lwe_idx_in[terms->num_radix_blocks];
+  size_t h_lwe_idx_out[terms->num_radix_blocks];
+  int32_t h_smart_copy_in[terms->num_radix_blocks];
+  int32_t h_smart_copy_out[terms->num_radix_blocks];
+
+  /// Here it is important to query the default max shared memory on device 0
+  /// instead of cuda_get_max_shared_memory,
+  /// to avoid bugs with tree_add_chunks trying to use too much shared memory
+  auto max_shared_memory = 0;
+  check_cuda_error(cudaDeviceGetAttribute(
+      &max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
+
+  // create lut object for message and carry
+  // we allocate luts_message_carry in the host function (instead of scratch)
+  // to reduce average memory consumption
+  int_radix_lut<Torus> *luts_message_carry;
+  size_t ch_amount = r / chunk_size;
+  if (!ch_amount)
+    ch_amount++;
+  if (reused_lut == nullptr) {
+    luts_message_carry = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
+        2 * ch_amount * num_radix_blocks, true, nullptr);
+  } else {
+    luts_message_carry = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
+        2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
  }
+  auto message_acc = luts_message_carry->get_lut(0, 0);
+  auto carry_acc = luts_message_carry->get_lut(0, 1);

-  cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
-                           total_blocks_in_vec * sizeof(uint64_t), streams[0],
-                           gpu_indexes[0]);
+  // define functions for each accumulator
+  auto lut_f_message = [message_modulus](Torus x) -> Torus {
+    return x % message_modulus;
+  };
+  auto lut_f_carry = [message_modulus](Torus x) -> Torus {
+    return x / message_modulus;
+  };

-  cuda_set_device(gpu_indexes[0]);
-  radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
-      d_columns, d_columns_counter, d_degrees, num_radix_blocks,
-      num_radix_in_vec);
+  // generate accumulators
+  generate_device_accumulator<Torus>(
+      streams[0], gpu_indexes[0], message_acc,
+      luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
+      glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+      lut_f_message, true);
+  generate_device_accumulator<Torus>(
+      streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
+      luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
+      message_modulus, carry_modulus, lut_f_carry, true);
+  luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

-  bool needs_processing = false;
-  radix_columns<Torus> current_columns(current_blocks->degrees,
-                                       num_radix_blocks, num_radix_in_vec,
-                                       chunk_size, needs_processing);
-  int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
-  int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
-  const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
+  while (r > 2) {
+    size_t cur_total_blocks = r * num_radix_blocks;
+    size_t ch_amount = r / chunk_size;
+    if (!ch_amount)
+      ch_amount++;
+    dim3 add_grid(ch_amount, num_radix_blocks, 1);

-  mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count,
-                               num_radix_in_vec, current_blocks->degrees);
-
-  while (needs_processing) {
-    auto luts_message_carry = mem_ptr->luts_message_carry;
-    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
-    auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
    cuda_set_device(gpu_indexes[0]);
-    calculate_chunks<Torus>
-        <<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
-            (Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
-            chunk_size, big_lwe_size);
+    tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
+        (Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
+        std::min(r, chunk_size), big_lwe_size, num_radix_blocks);

-    prepare_new_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
-        d_new_columns, d_new_columns_counter, d_columns, d_columns_counter,
-        chunk_size);
+    check_cuda_error(cudaGetLastError());

-    uint32_t total_ciphertexts = 0;
-    uint32_t total_messages = 0;
-    current_columns.next_accumulation(luts_message_carry->h_lwe_indexes_in,
-                                      luts_message_carry->h_lwe_indexes_out,
-                                      luts_message_carry->h_lut_indexes,
-                                      total_ciphertexts, total_messages,
-                                      needs_processing);
+    size_t total_count = 0;
+    size_t message_count = 0;
+    size_t carry_count = 0;
+    size_t sm_copy_count = 0;
+
+    generate_ids_update_degrees(
+        terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
+        h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
+        message_max, total_count, message_count, carry_count, sm_copy_count);
+    auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
+    auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
    luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
-                                        luts_message_carry->h_lwe_indexes_in,
-                                        luts_message_carry->h_lwe_indexes_out);
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        luts_message_carry->get_lut_indexes(0, 0),
-        luts_message_carry->h_lut_indexes,
-        luts_message_carry->num_blocks * sizeof(Torus), streams[0],
-        gpu_indexes[0], true);
-    luts_message_carry->broadcast_lut(streams, gpu_indexes);
+                                        h_lwe_idx_in, h_lwe_idx_out);

-    auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
+    size_t copy_size = sm_copy_count * sizeof(int32_t);
+    cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
+                             streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
+                             streams[0], gpu_indexes[0]);
+
+    // inside d_smart_copy_in there are only -1 values
+    // it's fine to call smart_copy with same pointer
+    // as source and destination
+    copy_radix_ciphertext_slice_async<Torus>(
+        streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
+        new_blocks, 0, r * num_radix_blocks);
+    smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
+        (Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
+        d_smart_copy_out, d_smart_copy_in, big_lwe_size);
+    check_cuda_error(cudaGetLastError());
+
+    if (carry_count > 0)
+      cuda_set_value_async<Torus>(
+          streams[0], gpu_indexes[0],
+          luts_message_carry->get_lut_indexes(0, message_count), 1,
+          carry_count);
+
+    luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
+
+    /// For multi GPU execution we create vectors of pointers for inputs and
+    /// outputs
+    std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
+    std::vector<Torus *> small_lwe_vector_vec =
+        luts_message_carry->lwe_after_ks_vec;
+    std::vector<Torus *> lwe_after_pbs_vec =
+        luts_message_carry->lwe_after_pbs_vec;
+    std::vector<Torus *> lwe_trivial_indexes_vec =
+        luts_message_carry->lwe_trivial_indexes_vec;
+
+    auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
    if (active_gpu_count == 1) {
+      /// Apply KS to go from a big LWE dimension to a small LWE dimension
+      /// After this keyswitch execution, we need to synchronize the streams
+      /// because the keyswitch and PBS do not operate on the same number of
+      /// inputs
      execute_keyswitch_async<Torus>(
          streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
-          d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
+          lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
+          polynomial_size * glwe_dimension, small_lwe_dimension,
+          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
+
+      /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+      /// dimension to a big LWE dimension
+      execute_pbs_async<Torus>(
+          streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
+          luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
+          (Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
+          ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
+          small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
+          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
+          total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
+    } else {
+      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+      multi_gpu_scatter_lwe_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, new_blocks_vec,
+          (Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
+          luts_message_carry->using_trivial_lwe_indexes, message_count,
+          big_lwe_size);
+
+      /// Apply KS to go from a big LWE dimension to a small LWE dimension
+      /// After this keyswitch execution, we need to synchronize the streams
+      /// because the keyswitch and PBS do not operate on the same number of
+      /// inputs
+      execute_keyswitch_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
+          lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
          ksks, big_lwe_dimension, small_lwe_dimension,
-          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
-          total_messages);
+          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);

+      /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
+      /// different configuration
+      multi_gpu_gather_lwe_async<Torus>(
+          streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
+          small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
+          luts_message_carry->using_trivial_lwe_indexes, message_count,
+          small_lwe_size);
+      /// Synchronize all GPUs
+      for (uint i = 0; i < active_gpu_count; i++) {
+        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+      }
+
+      multi_gpu_scatter_lwe_async<Torus>(
+          streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
+          (Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
+          luts_message_carry->using_trivial_lwe_indexes, total_count,
+          small_lwe_size);
+
+      /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+      /// dimension to a big LWE dimension
      execute_pbs_async<Torus>(
-          streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
-          d_pbs_indexes_out, luts_message_carry->lut_vec,
-          luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
-          d_pbs_indexes_in, bsks, ms_noise_reduction_key,
+          streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+          lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
+          luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
+          lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
          luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
          polynomial_size, mem_ptr->params.pbs_base_log,
          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
-          total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
-          lut_stride);
-    } else {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, current_blocks, current_blocks, bsks,
-          ksks, ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
+          total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
+
+      multi_gpu_gather_lwe_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
+          lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
+          luts_message_carry->using_trivial_lwe_indexes, total_count,
+          big_lwe_size);
+      /// Synchronize all GPUs
+      for (uint i = 0; i < active_gpu_count; i++) {
+        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+      }
    }
-    std::swap(d_columns, d_new_columns);
-    std::swap(d_columns_counter, d_new_columns_counter);
-  }
-
-  cuda_set_device(gpu_indexes[0]);
-  calculate_final_chunk_into_radix<Torus>
-      <<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
-          (Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
-          d_columns, d_columns_counter, chunk_size, big_lwe_size);
-
-  if (mem_ptr->reduce_degrees_for_single_carry_propagation) {
-    auto luts_message_carry = mem_ptr->luts_message_carry;
-    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
-    auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
-    auto h_pbs_indexes_in = mem_ptr->luts_message_carry->h_lwe_indexes_in;
-    auto h_pbs_indexes_out = mem_ptr->luts_message_carry->h_lwe_indexes_out;
-    auto h_lut_indexes = mem_ptr->luts_message_carry->h_lut_indexes;
-    current_columns.final_calculation(luts_message_carry->h_lwe_indexes_in,
-                                      luts_message_carry->h_lwe_indexes_out,
-                                      luts_message_carry->h_lut_indexes);
-    mem_ptr->luts_message_carry->set_lwe_indexes(
-        streams[0], gpu_indexes[0], h_pbs_indexes_in, h_pbs_indexes_out);
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        luts_message_carry->get_lut_indexes(0, 0), h_lut_indexes,
-        2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0], true);
-    luts_message_carry->broadcast_lut(streams, gpu_indexes);
-
-    set_zero_radix_ciphertext_slice_async<Torus>(
-        streams[0], gpu_indexes[0], current_blocks, num_radix_blocks,
-        num_radix_blocks + 1);
-
-    auto active_gpu_count =
-        get_active_gpu_count(2 * num_radix_blocks, gpu_count);
-
-    if (active_gpu_count == 1) {
-      execute_keyswitch_async<Torus>(
-          streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
-          d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
-          big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
-          mem_ptr->params.ks_level, num_radix_blocks);
-
-      execute_pbs_async<Torus>(
-          streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
-          d_pbs_indexes_out, luts_message_carry->lut_vec,
-          luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
-          d_pbs_indexes_in, bsks, ms_noise_reduction_key,
-          luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
-          polynomial_size, mem_ptr->params.pbs_base_log,
-          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
-          2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
-          lut_stride);
-    } else {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
-          bsks, ksks, ms_noise_reduction_key, luts_message_carry,
-          2 * num_radix_blocks);
+    for (uint i = 0; i < total_count; i++) {
+      auto degrees_index = luts_message_carry->h_lut_indexes[i];
+      new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
+      new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
    }
-    calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
-                            num_radix_blocks, num_radix_in_vec, chunk_size,
-                            mem_ptr->params.message_modulus);
-    cuda_set_device(gpu_indexes[0]);
-    CudaRadixCiphertextFFI current_blocks_slice;
-    as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
-                                     num_radix_blocks, 2 * num_radix_blocks);

-    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
-                         current_blocks, &current_blocks_slice,
-                         num_radix_blocks, mem_ptr->params.message_modulus,
-                         mem_ptr->params.carry_modulus);
+    int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
+    int new_blocks_created = 2 * ch_amount * num_radix_blocks;
+
+    if (rem_blocks > 0)
+      copy_radix_ciphertext_slice_async<Torus>(
+          streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
+          new_blocks_created + rem_blocks, old_blocks,
+          cur_total_blocks - rem_blocks, cur_total_blocks);
+    std::swap(new_blocks, old_blocks);
+    r = (new_blocks_created + rem_blocks) / num_radix_blocks;
  }
+  luts_message_carry->release(streams, gpu_indexes, gpu_count);
+  delete (luts_message_carry);
+
+  CudaRadixCiphertextFFI old_blocks_slice;
+  as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
+                                   num_radix_blocks, 2 * num_radix_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                       &old_blocks_slice, num_radix_blocks);
 }

 template <typename Torus, class params>
@@ -607,10 +599,10 @@ __host__ void host_integer_mult_radix_kb(
    size_t b_id = i % num_blocks;
    terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
  }
-  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
+  host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
      ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
-      2 * num_blocks);
+      2 * num_blocks, mem_ptr->luts_array);

  auto scp_mem_ptr = mem_ptr->sc_prop_mem;
  uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -631,7 +623,7 @@ __host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_mul_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, is_boolean_left,
-      is_boolean_right, num_radix_blocks, allocate_gpu_memory, size_tracker);
+      is_boolean_right, num_radix_blocks, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -106,8 +106,6 @@ __host__ void host_integer_radix_negation(

    lwe_array_out->degrees[i] = z - static_cast<uint64_t>(zb);
    lwe_array_out->noise_levels[i] = lwe_array_in->noise_levels[i];
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], message_modulus,
-                      carry_modulus);
    zb = z / message_modulus;
  }
 }
@@ -123,7 +121,7 @@ __host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_overflowing_sub_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
-      allocate_ms_array, size_tracker);
+      allocate_ms_array, &size_tracker);
  POP_RANGE()
  return size_tracker;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -13,7 +13,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
                                        CudaRadixCiphertextFFI *radix,
                                        const uint32_t num_radix_blocks,
                                        const uint32_t lwe_dimension,
-                                        uint64_t &size_tracker,
+                                        uint64_t *size_tracker,
                                        bool allocate_gpu_memory) {
  PUSH_RANGE("create zero radix ct");
  radix->lwe_dimension = lwe_dimension;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -34,7 +34,7 @@ void update_degrees_after_scalar_bitor(uint64_t *output_degrees,
    auto result = max;

    for (uint j = 0; j < min + 1; j++) {
-      if ((max | j) > result) {
+      if (max | j > result) {
        result = max | j;
      }
    }
@@ -52,7 +52,7 @@ void update_degrees_after_scalar_bitxor(uint64_t *output_degrees,

    // Try every possibility to find the worst case
    for (uint j = 0; j < min + 1; j++) {
-      if ((max ^ j) > result) {
+      if (max ^ j > result) {
        result = max ^ j;
      }
    }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -47,7 +47,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
                                 num_clear_blocks * sizeof(Torus), streams[0],
                                 gpu_indexes[0]);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, output, input, bsks, ksks,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -154,7 +154,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
        lut->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, scalar_last_leaf_lut_f, true);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -253,7 +253,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
        lut->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -277,9 +277,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        auto overflowed = x_0 < x_1;
        return (Torus)(invert_flags.second ^ overflowed);
      };
-      uint64_t size = 0;
      int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
+          streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);

      generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
@@ -287,7 +286,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
          params.carry_modulus, one_block_lut_f, true);

-      one_block_lut->broadcast_lut(streams, gpu_indexes);
+      one_block_lut->broadcast_lut(streams, gpu_indexes, 0);

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -434,7 +433,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
        lut->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
-    lut->broadcast_lut(streams, gpu_indexes);
+    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -540,7 +539,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, lut_f, true);
-    signed_msb_lut->broadcast_lut(streams, gpu_indexes);
+    signed_msb_lut->broadcast_lut(streams, gpu_indexes, 0);

    CudaRadixCiphertextFFI sign_block;
    as_radix_ciphertext_slice<Torus>(
@@ -579,9 +578,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
               is_x_less_than_y_given_input_borrow<Torus>(x_0, x_1, 0,
                                                          message_modulus);
      };
-      uint64_t size = 0;
      int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
+          streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);

      generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
@@ -589,7 +587,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
          params.carry_modulus, one_block_lut_f, true);

-      one_block_lut->broadcast_lut(streams, gpu_indexes);
+      one_block_lut->broadcast_lut(streams, gpu_indexes, 0);

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -819,7 +817,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
          num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
          gpu_indexes[0]);
    }
-    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes);
+    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
@@ -1,202 +0,0 @@
-#include "scalar_div.cuh"
-
-uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
-      (int_unsigned_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory);
-}
-
-void cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi) {
-
-  host_integer_unsigned_scalar_div_radix<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
-      (int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
-      ms_noise_reduction_key, scalar_divisor_ffi);
-}
-
-void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
-      (int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
-
-uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
-      (int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory);
-}
-
-void cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
-
-  host_integer_signed_scalar_div_radix_kb<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
-      (int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
-      ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
-}
-
-void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_signed_scalar_div_mem<uint64_t> *mem_ptr =
-      (int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
-
-uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
-    bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
-      (int_unsigned_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
-}
-
-void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    void const *clear_blocks, void const *h_clear_blocks,
-    uint32_t num_clear_blocks) {
-
-  host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
-      remainder_ct, (int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr,
-      bsks, (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
-      divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
-      (uint64_t *)clear_blocks, (uint64_t *)h_clear_blocks, num_clear_blocks);
-}
-
-void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
-      (int_unsigned_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
-
-uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
-    bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
-      (int_signed_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
-}
-
-void cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    uint32_t numerator_bits) {
-
-  host_integer_signed_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
-      remainder_ct, (int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
-      divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
-      numerator_bits);
-}
-
-void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =
-      (int_signed_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cuh
@@ -1,415 +0,0 @@
-#ifndef SCALAR_DIV_CUH
-#define SCALAR_DIV_CUH
-
-#include "integer/integer_utilities.h"
-#include "integer/scalar_bitops.cuh"
-#include "integer/scalar_mul.cuh"
-#include "integer/scalar_shifts.cuh"
-#include "integer/subtraction.cuh"
-
-template <typename Torus>
-__host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, const int_radix_params params,
-    int_unsigned_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-
-  *mem_ptr = new int_unsigned_scalar_div_mem<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
-
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_integer_unsigned_scalar_div_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
-    int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi) {
-
-  if (scalar_divisor_ffi->is_abs_divisor_one) {
-    return;
-  }
-
-  if (scalar_divisor_ffi->is_divisor_pow2) {
-    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct,
-        scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
-        bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
-    return;
-  }
-
-  if (scalar_divisor_ffi->divisor_has_more_bits_than_numerator) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                       mem_ptr->tmp_ffi);
-    return;
-  }
-
-  if (scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
-
-    if (scalar_divisor_ffi->shift_pre != (uint64_t)0) {
-      PANIC("shift_pre should be == 0");
-    }
-
-    if (scalar_divisor_ffi->shift_post == (uint32_t)0) {
-      PANIC("shift_post should be > 0");
-    }
-
-    CudaRadixCiphertextFFI *numerator_cpy = mem_ptr->tmp_ffi;
-
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                       numerator_cpy, numerator_ct);
-
-    host_integer_radix_scalar_mul_high_kb<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_cpy,
-        mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
-        scalar_divisor_ffi);
-
-    host_sub_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
-        nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
-        ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
-
-    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, (uint32_t)1,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
-        numerator_ct->num_radix_blocks);
-
-    host_add_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
-        nullptr, mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key,
-        FLAG_NONE, (uint32_t)0);
-
-    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct,
-        scalar_divisor_ffi->shift_post - (uint32_t)1,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
-        numerator_ct->num_radix_blocks);
-
-    return;
-  }
-
-  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, numerator_ct,
-      scalar_divisor_ffi->shift_pre, mem_ptr->logical_scalar_shift_mem, bsks,
-      ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
-
-  host_integer_radix_scalar_mul_high_kb<Torus>(
-      streams, gpu_indexes, gpu_count, numerator_ct,
-      mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
-      scalar_divisor_ffi);
-
-  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, numerator_ct,
-      scalar_divisor_ffi->shift_post, mem_ptr->logical_scalar_shift_mem, bsks,
-      ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
-}
-
-template <typename Torus>
-__host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_radix_params params,
-    int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-
-  *mem_ptr = new int_signed_scalar_div_mem<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
-
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_integer_signed_scalar_div_radix_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
-    int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
-
-  if (scalar_divisor_ffi->is_abs_divisor_one) {
-    if (scalar_divisor_ffi->is_divisor_negative) {
-      CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
-
-      host_integer_radix_negation<Torus>(
-          streams, gpu_indexes, gpu_count, tmp, numerator_ct,
-          mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
-          numerator_ct->num_radix_blocks);
-
-      copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                         numerator_ct, tmp);
-    }
-    return;
-  }
-
-  if (scalar_divisor_ffi->chosen_multiplier_has_more_bits_than_numerator) {
-    set_zero_radix_ciphertext_slice_async<Torus>(
-        streams[0], gpu_indexes[0], numerator_ct, 0,
-        numerator_ct->num_radix_blocks);
-    return;
-  }
-
-  CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
-
-  if (scalar_divisor_ffi->is_divisor_pow2) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
-                                       numerator_ct);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp,
-        scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp,
-        numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
-        tmp->num_radix_blocks);
-
-    host_add_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
-        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
-        (uint32_t)0);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp,
-        scalar_divisor_ffi->chosen_multiplier_num_bits,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-  } else if (!scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
-                                       numerator_ct);
-
-    host_integer_radix_signed_scalar_mul_high_kb<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
-        ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
-                                       numerator_ct);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    host_sub_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
-        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
-        FLAG_NONE, (uint32_t)0);
-
-  } else {
-
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
-                                       numerator_ct);
-
-    host_integer_radix_signed_scalar_mul_high_kb<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
-        ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
-
-    host_add_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
-        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
-        (uint32_t)0);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
-                                       numerator_ct);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    host_sub_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
-        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
-        FLAG_NONE, (uint32_t)0);
-  }
-
-  if (scalar_divisor_ffi->is_divisor_negative) {
-    host_integer_radix_negation<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, tmp,
-        mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
-        numerator_ct->num_radix_blocks);
-  } else {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                       tmp);
-  }
-}
-
-template <typename Torus>
-__host__ uint64_t scratch_integer_unsigned_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, const int_radix_params params,
-    int_unsigned_scalar_div_rem_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-  *mem_ptr = new int_unsigned_scalar_div_rem_buffer<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
-      size_tracker);
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_integer_unsigned_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
-    CudaRadixCiphertextFFI *remainder_ct,
-    int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    Torus const *clear_blocks, Torus const *h_clear_blocks,
-    uint32_t num_clear_blocks) {
-
-  auto numerator_ct = mem_ptr->numerator_ct;
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                     quotient_ct);
-
-  host_integer_unsigned_scalar_div_radix(
-      streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->unsigned_div_mem,
-      bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi);
-
-  if (scalar_divisor_ffi->is_divisor_pow2) {
-
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       numerator_ct);
-    host_integer_radix_scalar_bitop_kb(
-        streams, gpu_indexes, gpu_count, remainder_ct, remainder_ct,
-        clear_blocks, h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
-        bsks, ksks, ms_noise_reduction_key);
-
-  } else {
-    if (!scalar_divisor_ffi->is_divisor_zero) {
-      copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                         remainder_ct, quotient_ct);
-
-      if (!scalar_divisor_ffi->is_abs_divisor_one &&
-          remainder_ct->num_radix_blocks != 0) {
-
-        host_integer_scalar_mul_radix<Torus>(
-            streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
-            divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
-            ms_noise_reduction_key, mem_ptr->params.message_modulus,
-            num_scalars_divisor);
-      }
-    }
-
-    host_sub_and_propagate_single_carry(
-        streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
-        nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
-        ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
-
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       numerator_ct);
-  }
-}
-
-template <typename Torus>
-__host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, const int_radix_params params,
-    int_signed_scalar_div_rem_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-
-  *mem_ptr = new int_signed_scalar_div_rem_buffer<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
-      size_tracker);
-
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_integer_signed_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
-    CudaRadixCiphertextFFI *remainder_ct,
-    int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    uint32_t numerator_bits) {
-
-  auto numerator_ct = mem_ptr->numerator_ct;
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                     quotient_ct);
-
-  host_integer_signed_scalar_div_radix_kb(
-      streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->signed_div_mem,
-      bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
-
-  host_propagate_single_carry<Torus>(
-      streams, gpu_indexes, gpu_count, quotient_ct, nullptr, nullptr,
-      mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
-      (uint32_t)0);
-
-  if (!scalar_divisor_ffi->is_divisor_negative &&
-      scalar_divisor_ffi->is_divisor_pow2) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       quotient_ct);
-
-    host_integer_radix_logical_scalar_shift_kb_inplace(
-        streams, gpu_indexes, gpu_count, remainder_ct,
-        scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
-        bsks, ksks, ms_noise_reduction_key, remainder_ct->num_radix_blocks);
-
-  } else if (!scalar_divisor_ffi->is_divisor_zero) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       quotient_ct);
-
-    bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
-                          !scalar_divisor_ffi->is_divisor_negative;
-
-    if (!is_divisor_one && remainder_ct->num_radix_blocks != 0) {
-      host_integer_scalar_mul_radix<Torus>(
-          streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
-          divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
-          ms_noise_reduction_key, mem_ptr->params.message_modulus,
-          num_scalars_divisor);
-    }
-  }
-
-  host_sub_and_propagate_single_carry(
-      streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
-      nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
-      ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
-
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                     numerator_ct);
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -6,8 +6,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
-    bool allocate_ms_array) {
+    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -18,7 +17,28 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
  return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
-      num_scalar_bits, allocate_gpu_memory);
+      allocate_gpu_memory);
+}
+
+uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool anticipated_buffer_drop, bool allocate_gpu_memory,
+    bool allocate_ms_array) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus,
+                          allocate_ms_array);
+
+  return scratch_cuda_integer_radix_scalar_mul_high_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_scalar_mul_high<uint64_t> **)mem_ptr, num_blocks, params,
+      anticipated_buffer_drop, allocate_gpu_memory);
 }

 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
@@ -29,11 +49,73 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {

-  host_integer_scalar_mul_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-      decomposed_scalar, has_at_least_one_set,
-      reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
-      (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+  switch (polynomial_size) {
+  case 512:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 1024:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 2048:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 4096:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 8192:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 16384:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  default:
+    PANIC("Cuda error (scalar multiplication): unsupported polynomial size. "
+          "Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
+  }
+}
+
+void cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
+    uint64_t rhs, uint64_t const *decomposed_scalar,
+    uint64_t const *has_at_least_one_set,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_scalars) {
+
+  host_integer_radix_scalar_mul_high_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, ct,
+      (int_scalar_mul_high<uint64_t> *)mem_ptr, (uint64_t **)ksks, rhs,
+      decomposed_scalar, has_at_least_one_set, ms_noise_reduction_key, bsks,
      num_scalars);
 }

@@ -47,3 +129,22 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,

  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
+
+void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
+
+  int_scalar_mul_high<uint64_t> *mem_ptr =
+      (int_scalar_mul_high<uint64_t> *)(*mem_ptr_void);
+
+  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
+}
+
+void cuda_small_scalar_multiplication_integer_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *lwe_array, uint64_t scalar) {
+
+  host_integer_small_scalar_mul_radix<uint64_t>((cudaStream_t *)(streams),
+                                                gpu_indexes, gpu_count,
+                                                lwe_array, lwe_array, scalar);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -33,16 +33,16 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
-    uint32_t num_scalar_bits, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_scalar_mul_buffer<T>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      num_scalar_bits, allocate_gpu_memory, true, size_tracker);
+      allocate_gpu_memory, true, &size_tracker);
  return size_tracker;
 }

-template <typename T>
+template <typename T, class params>
 __host__ void host_integer_scalar_mul_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
@@ -80,7 +80,7 @@ __host__ void host_integer_scalar_mul_radix(
    }
  }
  size_t j = 0;
-  for (size_t i = 0; i < std::min(num_scalars, num_ciphertext_bits); i++) {
+  for (size_t i = 0; i < min(num_scalars, num_ciphertext_bits); i++) {
    if (decomposed_scalar[i] == 1) {
      // Perform a block shift
      CudaRadixCiphertextFFI preshifted_radix_ct;
@@ -116,10 +116,13 @@ __host__ void host_integer_scalar_mul_radix(
    set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
                                             lwe_array, 0, num_radix_blocks);
  } else {
-    host_integer_partial_sum_ciphertexts_vec_kb<T>(
+    for (int i = 0; i < j * num_radix_blocks; i++) {
+      all_shifted_buffer->degrees[i] = message_modulus - 1;
+    }
+    host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
        ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
-        num_radix_blocks, j);
+        num_radix_blocks, j, nullptr);

    auto scp_mem_ptr = mem->sc_prop_mem;
    uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -136,8 +139,7 @@ template <typename T>
 __host__ void host_integer_small_scalar_mul_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *output_lwe_array,
-    CudaRadixCiphertextFFI *input_lwe_array, T scalar,
-    const uint32_t message_modulus, const uint32_t carry_modulus) {
+    CudaRadixCiphertextFFI *input_lwe_array, T scalar) {

  if (output_lwe_array->num_radix_blocks != input_lwe_array->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -167,20 +169,35 @@ __host__ void host_integer_small_scalar_mul_radix(
    output_lwe_array->noise_levels[i] =
        input_lwe_array->noise_levels[i] * scalar;
    output_lwe_array->degrees[i] = input_lwe_array->degrees[i] * scalar;
-    CHECK_NOISE_LEVEL(output_lwe_array->noise_levels[i], message_modulus,
-                      carry_modulus);
  }
 }

+template <typename Torus>
+__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_scalar_mul_high<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params,
+    bool anticipated_buffer_drop, bool allocate_gpu_memory) {
+
+  uint64_t size_tracker = 0;
+
+  *mem_ptr = new int_scalar_mul_high<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
+      allocate_gpu_memory, LEFT_SHIFT, anticipated_buffer_drop, &size_tracker);
+
+  return size_tracker;
+}
+
 template <typename Torus>
 __host__ void host_integer_radix_scalar_mul_high_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
-    int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
+    int_scalar_mul_high<Torus> *mem_ptr, Torus *const *ksks, uint64_t rhs,
+    uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
+    void *const *bsks, uint32_t num_scalars) {

-  if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
+  if (rhs == (uint64_t)0) {
    set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
                                                 0, ct->num_radix_blocks);
    return;
@@ -191,71 +208,66 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
  host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
                                                        gpu_indexes);

-  if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
-      !scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
-      tmp_ffi->num_radix_blocks != 0) {
+  if (rhs != (uint64_t)1 || tmp_ffi->num_radix_blocks != 0) {
+    if ((rhs & (rhs - 1)) == 0) {
+
+      uint32_t shift = std::log2(rhs);

-    if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_ffi,
-          scalar_divisor_ffi->ilog2_chosen_multiplier,
+          streams, gpu_indexes, gpu_count, tmp_ffi, shift,
          mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
          ms_noise_reduction_key, tmp_ffi->num_radix_blocks);

    } else {

-      host_integer_scalar_mul_radix<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_ffi,
-          scalar_divisor_ffi->decomposed_chosen_multiplier,
-          scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
-          mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
-          ms_noise_reduction_key, mem_ptr->params.message_modulus,
-          scalar_divisor_ffi->num_scalars);
-    }
-  }
-
-  host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_signed_scalar_mul_high_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
-    int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    void *const *bsks) {
-
-  if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
-    set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
-                                                 0, ct->num_radix_blocks);
-    return;
-  }
-
-  CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;
-
-  host_extend_radix_with_sign_msb<Torus>(
-      streams, gpu_indexes, gpu_count, tmp_ffi, ct, mem_ptr->extend_radix_mem,
-      ct->num_radix_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
-
-  if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
-      !scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
-      tmp_ffi->num_radix_blocks != 0) {
-
-    if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
-      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_ffi,
-          scalar_divisor_ffi->ilog2_chosen_multiplier,
-          mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
-          ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
-    } else {
-      host_integer_scalar_mul_radix<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_ffi,
-          scalar_divisor_ffi->decomposed_chosen_multiplier,
-          scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
-          mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
-          ms_noise_reduction_key, mem_ptr->params.message_modulus,
-          scalar_divisor_ffi->num_scalars);
+      switch (mem_ptr->params.polynomial_size) {
+      case 512:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 1024:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 2048:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 4096:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 8192:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 16384:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      default:
+        PANIC(
+            "Cuda error (scalar multiplication): unsupported polynomial size. "
+            "Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
+      }
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -133,7 +133,7 @@ __host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_shift_and_rotate_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
-      num_radix_blocks, allocate_gpu_memory, size_tracker);
+      num_radix_blocks, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -159,13 +159,11 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    // control_bit|b|a
    host_pack_bivariate_blocks<Torus>(
        streams, gpu_indexes, gpu_count, mux_inputs, mux_lut->lwe_indexes_out,
-        rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits,
-        mem->params.message_modulus, mem->params.carry_modulus);
+        rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits);

    // The shift bit is already properly aligned/positioned
    host_add_the_same_block_to_all_blocks<Torus>(
-        streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit,
-        mem->params.message_modulus, mem->params.carry_modulus);
+        streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit);

    // we have
    // control_bit|b|a
@@ -185,9 +183,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

  // Bitshift and add the other bits
  for (int i = bits_per_block - 2; i >= 0; i--) {
-    host_integer_small_scalar_mul_radix<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array, lwe_array, 2,
-        mem->params.message_modulus, mem->params.carry_modulus);
+    host_integer_small_scalar_mul_radix<Torus>(streams, gpu_indexes, gpu_count,
+                                               lwe_array, lwe_array, 2);
    for (int j = 0; j < num_radix_blocks; j++) {
      CudaRadixCiphertextFFI block;
      CudaRadixCiphertextFFI bit_to_add;
@@ -196,8 +193,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
                                       i + j * bits_per_block,
                                       i + j * bits_per_block + 1);
      host_addition<Torus>(streams[0], gpu_indexes[0], &block, &block,
-                           &bit_to_add, 1, mem->params.message_modulus,
-                           mem->params.carry_modulus);
+                           &bit_to_add, 1);
    }

    // To give back a clean ciphertext
--- a/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
@@ -23,7 +23,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry(

  *mem_ptr = new int_sub_and_propagate<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);

  return size_tracker;
 }
@@ -73,7 +73,6 @@ __host__ void host_integer_radix_subtraction(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in_2,
      message_modulus, carry_modulus, num_radix_blocks);
  host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_out,
-                       lwe_array_in_1, num_radix_blocks, message_modulus,
-                       carry_modulus);
+                       lwe_array_in_1, num_radix_blocks);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -10,7 +10,7 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
      output->num_radix_blocks != input_2->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
  host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
-                          input_1, input_2, output->num_radix_blocks, 0, 0);
+                          input_1, input_2, output->num_radix_blocks);
 }

 /*
@@ -48,7 +48,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
      output->num_radix_blocks != input_2->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
  host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
-                          input_1, input_2, output->num_radix_blocks, 0, 0);
+                          input_1, input_2, output->num_radix_blocks);
 }

 /*
@@ -147,3 +147,44 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
      static_cast<const uint64_t *>(lwe_array_in), plaintext_in,
      input_lwe_dimension, input_lwe_ciphertext_count);
 }
+
+/*
+ * Perform the subtraction of a u64 input LWE ciphertext vector with a u64 input
+ * plaintext vector.
+ * - `stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - `lwe_array_out` is an array of size
+ * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
+ * been allocated on the GPU before calling this function, and that will hold
+ * the result of the computation.
+ * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
+ * been allocated and initialized before calling this function. It has the same
+ * size as the output array.
+ * - `plaintext_array_in` is the plaintext vector used as input, it should have
+ * been allocated and initialized before calling this function. It should be of
+ * size `input_lwe_ciphertext_count`.
+ * - `input_lwe_dimension` is the number of mask elements in the input and
+ * output LWE ciphertext vectors
+ * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
+ * input LWE ciphertext vector, as well as in the output. It is also the number
+ * of plaintexts in the input plaintext vector.
+ *
+ * Each plaintext of the input plaintext vector is subtracted to the body of the
+ * corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
+ * operation is stored in the output LWE ciphertext vector. The two input
+ * vectors are unchanged. This function is a wrapper to a device function that
+ * performs the operation on the GPU.
+ */
+void cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void const *plaintext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count) {
+
+  host_subtraction_plaintext<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -9,7 +9,6 @@
 #include "device.h"
 #include "helper_multi_gpu.h"
 #include "integer/integer.h"
-#include "integer/integer_utilities.h"
 #include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
 #include <stdio.h>
@@ -103,12 +102,11 @@ __global__ void addition(T *output, T const *input_1, T const *input_2,
 // Coefficient-wise addition
 // num_radix_blocks selects the amount of blocks to be added from the inputs
 template <typename T>
-__host__ void
-host_addition(cudaStream_t stream, uint32_t gpu_index,
-              CudaRadixCiphertextFFI *output,
-              CudaRadixCiphertextFFI const *input_1,
-              CudaRadixCiphertextFFI const *input_2, uint32_t num_radix_blocks,
-              const uint32_t message_modulus, const uint32_t carry_modulus) {
+__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index,
+                            CudaRadixCiphertextFFI *output,
+                            CudaRadixCiphertextFFI const *input_1,
+                            CudaRadixCiphertextFFI const *input_2,
+                            uint32_t num_radix_blocks) {
  if (output->lwe_dimension != input_1->lwe_dimension ||
      output->lwe_dimension != input_2->lwe_dimension)
    PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -137,7 +135,6 @@ host_addition(cudaStream_t stream, uint32_t gpu_index,
    output->degrees[i] = input_1->degrees[i] + input_2->degrees[i];
    output->noise_levels[i] =
        input_1->noise_levels[i] + input_2->noise_levels[i];
-    CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
  }
 }

@@ -163,8 +160,7 @@ template <typename T>
 __host__ void host_add_the_same_block_to_all_blocks(
    cudaStream_t stream, uint32_t gpu_index, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *input_with_multiple_blocks,
-    CudaRadixCiphertextFFI const *input_with_single_block,
-    const uint32_t message_modulus, const uint32_t carry_modulus) {
+    CudaRadixCiphertextFFI const *input_with_single_block) {
  if (output->num_radix_blocks != input_with_multiple_blocks->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
  if (input_with_single_block->num_radix_blocks != 1)
@@ -196,7 +192,6 @@ __host__ void host_add_the_same_block_to_all_blocks(
                         input_with_single_block->degrees[0];
    output->noise_levels[i] = input_with_multiple_blocks->noise_levels[i] +
                              input_with_single_block->noise_levels[0];
-    CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
  }
 }

@@ -276,7 +271,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
 }

 template <typename T>
-__global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
+__global__ void radix_body_subtraction_inplace(T *lwe_ct,
+                                               const T *plaintext_input,
                                               uint32_t input_lwe_dimension,
                                               uint32_t num_entries) {

@@ -293,7 +289,7 @@ __global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
 template <typename T>
 __host__ void host_subtraction_plaintext(cudaStream_t stream,
                                         uint32_t gpu_index, T *output,
-                                         T *lwe_input, T *plaintext_input,
+                                         T *lwe_input, const T *plaintext_input,
                                         uint32_t input_lwe_dimension,
                                         uint32_t input_lwe_ciphertext_count) {

@@ -385,7 +381,6 @@ __host__ void host_unchecked_sub_with_correcting_term(
    output->noise_levels[i] =
        input_1->noise_levels[i] + input_2->noise_levels[i];
    zb = z / message_modulus;
-    CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
@@ -35,20 +35,6 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
                           static_cast<cudaStream_t>(stream), gpu_index);
 }

-void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
-    void *stream, uint32_t gpu_index, void *dest, void const *src,
-    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size, uint32_t grouping_factor) {
-  uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
-                               level_count * (1 << grouping_factor) /
-                               grouping_factor;
-  size_t buffer_size =
-      total_polynomials * polynomial_size * sizeof(__uint128_t);
-
-  cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size,
-                           static_cast<cudaStream_t>(stream), gpu_index);
-}
-
 // We need these lines so the compiler knows how to specialize these functions
 template __device__ const uint64_t *
 get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
@@ -94,14 +80,6 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                    int glwe_dimension,
                                                    uint32_t level_count);

-template __device__ const __uint128_t *
-get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i,
-                                          int k, int level,
-                                          uint32_t grouping_factor,
-                                          uint32_t polynomial_size,
-                                          uint32_t glwe_dimension,
-                                          uint32_t level_count);
-
 template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
    const uint64_t *ptr, int g, int i, int k, int level,
    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -83,62 +83,6 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
  __syncthreads();
 }

-/** Perform the matrix multiplication between the GGSW and the GLWE,
- * each block operating on a single level for mask and body.
- * Both operands should be at fourier domain
- *
- * This function assumes:
- *  - Thread blocks at dimension z relates to the decomposition level.
- *  - Thread blocks at dimension y relates to the glwe dimension.
- *  - polynomial_size / params::opt threads are available per block
- */
-template <typename G, class params>
-__device__ void mul_ggsw_glwe_in_fourier_domain_128(
-    double *fft, double *join_buffer,
-    const double *__restrict__ bootstrapping_key, int iteration, G &group,
-    bool support_dsm = false) {
-  const uint32_t polynomial_size = params::degree;
-  const uint32_t glwe_dimension = gridDim.y - 1;
-  const uint32_t level_count = gridDim.z;
-
-  // The first product is used to initialize level_join_buffer
-  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
-
-  // Continues multiplying fft by every polynomial in that particular bsk level
-  // Each y-block accumulates in a different polynomial at each iteration
-  auto bsk_slice = get_ith_mask_kth_block_128(
-      bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
-      glwe_dimension, level_count);
-  for (int j = 0; j < glwe_dimension + 1; j++) {
-    int idx = (j + this_block_rank) % (glwe_dimension + 1);
-
-    auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
-    auto buffer_slice = get_join_buffer_element_128<G>(
-        blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
-        support_dsm);
-
-    polynomial_product_accumulate_in_fourier_domain_128<params>(
-        buffer_slice, fft, bsk_poly, j == 0);
-    group.sync();
-  }
-
-  // -----------------------------------------------------------------
-  // All blocks are synchronized here; after this sync, level_join_buffer has
-  // the values needed from every other block
-
-  // accumulate rest of the products into fft buffer
-  for (int l = 0; l < level_count; l++) {
-    auto cur_src_acc = get_join_buffer_element_128<G>(
-        l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
-        support_dsm);
-
-    polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
-                                                        l == 0);
-  }
-
-  __syncthreads();
-}
-
 template <typename Torus>
 void execute_pbs_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -279,7 +223,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
                         uint32_t level_count, uint32_t grouping_factor,
                         uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
                         bool allocate_gpu_memory, bool allocate_ms_array,
-                         uint64_t &size_tracker) {
+                         uint64_t *size_tracker) {
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits
@@ -287,7 +231,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
    case MULTI_BIT:
      PANIC("Error: 32-bit multibit PBS is not supported.\n")
    case CLASSICAL:
-      size_tracker = scratch_cuda_programmable_bootstrap_32(
+      *size_tracker = scratch_cuda_programmable_bootstrap_32(
          stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, level_count, input_lwe_ciphertext_count,
          allocate_gpu_memory, allocate_ms_array);
@@ -302,12 +246,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
    case MULTI_BIT:
      if (grouping_factor == 0)
        PANIC("Multi-bit PBS error: grouping factor should be > 0.")
-      size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
+      *size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
      break;
    case CLASSICAL:
-      size_tracker = scratch_cuda_programmable_bootstrap_64(
+      *size_tracker = scratch_cuda_programmable_bootstrap_64(
          stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, level_count, input_lwe_ciphertext_count,
          allocate_gpu_memory, allocate_ms_array);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_128.cuh
@@ -8,9 +8,9 @@ execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
                        uint32_t polynomial_size, uint32_t level_count,
                        uint32_t input_lwe_ciphertext_count,
                        bool allocate_gpu_memory, bool allocate_ms_array,
-                        uint64_t &size_tracker_on_gpu) {
+                        uint64_t *size_tracker_on_gpu) {
  // The squash noise function receives as input 64-bit integers
-  size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
+  *size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
      stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, level_count, input_lwe_ciphertext_count,
      allocate_gpu_memory, allocate_ms_array);
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Guillermo Oyarzun	35b3430404	chore(gpu): fix some pke bugs	2025-06-19 12:22:39 +02:00
Guillermo Oyarzun	a56a8107ab	chore(gpu): reduce the pool size	2025-06-17 13:20:58 +02:00
Guillermo Oyarzun	b322708cbd	chore(gpu): add drift to pke gpu tests	2025-06-16 17:36:11 +02:00
Guillermo Oyarzun	25da5ef721	chore(gpu): add pke gpu tests	2025-06-16 15:17:54 +02:00
Guillermo Oyarzun	43b95aaf7c	chore(gpu): fix input_br variance formula for multi-bit	2025-06-16 12:27:27 +02:00
Guillermo Oyarzun	cfc19d3bd2	chore(gpu): correct ms formula in pbs128	2025-06-16 12:27:27 +02:00
Guillermo Oyarzun	51427fc9ae	chore(gpu): add multiplication and extra logs to pbs128 test	2025-06-16 12:27:26 +02:00
Guillermo Oyarzun	a45baaa3d0	chore(gpu): fixes after rebase	2025-06-16 12:27:26 +02:00
Guillermo Oyarzun	a24adf528b	chore(gpu): use the m128 params	2025-06-16 12:27:26 +02:00
Guillermo Oyarzun	3ddf9bdba6	chore(gpu): fix pbs128 after rebasing	2025-06-16 12:27:25 +02:00
Guillermo Oyarzun	de4902fb9f	chore(gpu): add noise squashing params without using it	2025-06-16 12:27:25 +02:00
Guillermo Oyarzun	b89bca0d13	chore(gpu): fix pbs128 with old params	2025-06-16 12:27:25 +02:00
Guillermo Oyarzun	f0dc0e18ab	chore(gpu): add pbs128	2025-06-16 12:27:24 +02:00
Guillermo Oyarzun	98ca66581e	chore(gpu): fix errors after rebasing	2025-06-16 12:27:24 +02:00
Guillermo Oyarzun	c94ccc3a23	feat(gpu): fix bugs and add extra logs	2025-06-16 12:27:24 +02:00
Guillermo Oyarzun	7e6573a1d2	feat(gpu): fix compression tests after rebase	2025-06-16 12:27:23 +02:00
Guillermo Oyarzun	94ff21b089	feat(gpu): add noise checks with multi-bit pbs	2025-06-16 12:27:23 +02:00
Guillermo Oyarzun	71420f0d92	feat(gpu): add noise checks	2025-06-16 12:27:23 +02:00
Arthur Meyre	d81bd4ebd6	wip: long pfail runs	2025-06-16 12:27:22 +02:00
Nicolas Sarlin	83d1d6a46c	wip: fix packing ks noise formula	2025-06-16 12:27:22 +02:00
Nicolas Sarlin	ac693f97e1	wip: remove check that makes compression pfail test fail	2025-06-16 12:27:22 +02:00
Nicolas Sarlin	d06656cfb4	wip: remove check that makes compression pfail test fail	2025-06-16 12:27:21 +02:00
Nicolas Sarlin	472ea682ae	wip: update noise formulas	2025-06-16 12:27:21 +02:00
Nicolas Sarlin	258524f5e2	chore(zk): add noise tests for zkv1	2025-06-16 12:27:21 +02:00
Nicolas Sarlin	6b98865515	fix: new param naming	2025-06-16 12:27:20 +02:00
Arthur Meyre	e47731b1ee	fix(shortint): fix sample count in pbs128 pfail	2025-06-16 12:27:20 +02:00
Arthur Meyre	d1b9bc676d	test(shortint): add variance check after KS in classic PBS AP	2025-06-16 12:27:19 +02:00
Arthur Meyre	405952e323	test(shortint): add normality check after KS in classic AP	2025-06-16 12:27:19 +02:00
Arthur Meyre	7556a8e05f	fix(test): fix slighlty wrong log message	2025-06-16 12:27:19 +02:00
Arthur Meyre	ebda1426e4	fix(test): fix test parameters for PBS 128 Disable the bound check as our computation disagrees with the RO	2025-06-16 12:27:18 +02:00
Arthur Meyre	da91075b26	test(shortint): add pfail estimation after ms in compression	2025-06-16 12:27:18 +02:00
Arthur Meyre	f95eb2cf2c	wip: re-exported pbs 128 with symbolic mantissa - add pbs 128 params with a mantissa setting - noise with this settings is in line with RO prediction	2025-06-16 12:27:17 +02:00
Arthur Meyre	45da14c7dd	wip: add pbs 128 tests (noise and pfail)	2025-06-16 12:27:16 +02:00
Arthur Meyre	75e03ae800	feat(core): add pbs 128 noise formulas	2025-06-16 12:27:16 +02:00
Arthur Meyre	9976cbe1f2	test(shortint): add pfail measurement for full compression + AP	2025-06-16 12:27:15 +02:00
Arthur Meyre	a9006486e8	test(shortint): slightly change decryption logic to also get the padding bit out	2025-06-16 12:27:14 +02:00
Arthur Meyre	7f8778f178	test(shortint): add pfail check for first part of compression	2025-06-16 12:27:14 +02:00
Arthur Meyre	370e4ae2e6	test(shortint): add first part of the compression AP	2025-06-16 12:27:13 +02:00
Arthur Meyre	b6db9d8ba0	chore: rename helper function to match test usage	2025-06-16 12:27:12 +02:00
Arthur Meyre	54b139b1b5	feat(core): add noise formula for packing keyswitch	2025-06-16 12:27:12 +02:00
Arthur Meyre	d3ccf08f2c	test(shortint): add pfail test for PKE -> KS -> (DP -> KS Compute) -> MS	2025-06-16 12:27:11 +02:00
Arthur Meyre	6f1492766f	refactor(test): also accept PKE -> KS to big -> DP -> KS -> MS	2025-06-16 12:27:10 +02:00
Arthur Meyre	03fa607209	test(shortint): add noise check for PKE -> KS to small key + MS	2025-06-16 12:27:09 +02:00
Arthur Meyre	29ab6c0709	wip: hotfix for borrow mut error, this needs a design pass/reflection	2025-06-16 12:27:09 +02:00
Arthur Meyre	1c0b428cd3	feat: add noise formulas for TUniform 132 bits of security	2025-06-16 12:27:08 +02:00
Arthur Meyre	dab2d39749	test(hl): add noise check for CompactPublicKey encryption w/ TUniform param	2025-06-16 12:27:07 +02:00
Arthur Meyre	ea81ef5d15	test(shortint): add pfail test for the classic AP	2025-06-16 12:27:06 +02:00
Arthur Meyre	57a7a5a084	refactor: prepare code factorization for noise and pfail	2025-06-16 12:27:06 +02:00
Arthur Meyre	9010ded3d5	test: add shortint atomic pattern noise measurement	2025-06-16 12:27:05 +02:00
Arthur Meyre	1ebd2848ad	test(hl): test encryption noise of FheUint ciphertext in HL API	2025-06-16 12:27:04 +02:00
Nicolas Sarlin	6a1a024e6d	chore(zk)!: store inside the pke params the supported zk scheme BREAKING_CHANGE: - Zk for compact PKE now requires dedicated encryption parameters	2025-06-16 12:27:04 +02:00