chore(gpu): fix some pke bugs

chore(gpu): reduce the pool size
chore(gpu): add drift to pke gpu tests
2026-04-28 03:01:21 -04:00 · 2025-06-19 12:22:39 +02:00 · 2025-06-17 13:20:58 +02:00 · 2025-06-16 17:36:11 +02:00 · 2025-06-16 15:17:54 +02:00 · 2025-06-16 12:27:27 +02:00
607 changed files with 20315 additions and 27114 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1 @@
 *.hpu filter=lfs diff=lfs merge=lfs -text
-*.bcode filter=lfs diff=lfs merge=lfs -text
-*.cbor filter=lfs diff=lfs merge=lfs -text
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -71,26 +71,45 @@ jobs:
        with:
          toolchain: stable

-      # Cache key is an aggregated hash of lfs files hashes
-      - name: Get LFS data sha
-        id: hash-lfs-data
+      - name: Use specific data branch
+        if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
+        env:
+          PR_BRANCH: ${{ github.head_ref || github.ref_name }}
        run: |
-          SHA=$(git lfs ls-files -l -I utils/tfhe-backward-compat-data | sha256sum | cut -d' ' -f1)
+          echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
+
+      - name: Get backward compat branch
+        id: backward_compat_branch
+        run: |
+          BRANCH="$(make backward_compat_branch)"
+          echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
+
+      - name: Get backward compat branch head SHA
+        id: backward_compat_sha
+        run: |
+          SHA=$(git ls-remote "${REPO_URL}" refs/heads/"${BACKWARD_COMPAT_BRANCH}" | awk '{print $1}')
          echo "sha=${SHA}" >> "${GITHUB_OUTPUT}"
+        env:
+          REPO_URL: "https://github.com/zama-ai/tfhe-backward-compat-data"
+          BACKWARD_COMPAT_BRANCH: ${{ steps.backward_compat_branch.outputs.branch }}

      - name: Retrieve data from cache
        id: retrieve-data-cache
        uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
        with:
-          path: |
-            utils/tfhe-backward-compat-data/**/*.cbor
-            utils/tfhe-backward-compat-data/**/*.bcode
-          key: ${{ steps.hash-lfs-data.outputs.sha }}
+          path: tests/tfhe-backward-compat-data
+          key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}

-      - name: Pull test data
+      - name: Clone test data
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
-        run: |
-          make pull_backward_compat_data
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+          repository: zama-ai/tfhe-backward-compat-data
+          path: tests/tfhe-backward-compat-data
+          lfs: 'true'
+          ref: ${{ steps.backward_compat_branch.outputs.branch }}

      - name: Run backward compatibility tests
        run: |
@@ -101,10 +120,8 @@ jobs:
        continue-on-error: true
        uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
        with:
-          path: |
-            utils/tfhe-backward-compat-data/**/*.cbor
-            utils/tfhe-backward-compat-data/**/*.bcode
-          key: ${{ steps.hash-lfs-data.outputs.sha }}
+          path: tests/tfhe-backward-compat-data
+          key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}

      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -103,7 +103,7 @@ jobs:
    name: Unsigned integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -104,7 +104,7 @@ jobs:
    name: Signed integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -31,7 +31,6 @@ on:
          - ks
          - ks_pbs
          - integer_zk
-          - hlapi_noise_squash
      op_flavor:
        description: "Operations set to run"
        type: choice
--- a/.github/workflows/benchmark_gpu_weekly.yml
+++ b/.github/workflows/benchmark_gpu_weekly.yml
@@ -10,16 +10,37 @@ on:
 permissions: {}

 jobs:
-  run-benchmarks-8-h100-sxm5-integer:
-    name: Run integer benchmarks (8xH100-SXM5)
+  run-benchmarks-1-h100:
+    name: Run integer benchmarks (1xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100x8-SXM5
+      profile: single-h100
+      hardware_name: n3-H100x1
+      command: integer,integer_multi_bit
+      op_flavor: default
+      bench_type: latency
+      all_precisions: true
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-2-h100:
+    name: Run integer benchmarks (2xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: 2-h100
+      hardware_name: n3-H100x2
      command: integer_multi_bit
      op_flavor: default
-      bench_type: both
+      bench_type: latency
      all_precisions: true
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -31,16 +52,16 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-8-h100-sxm5-integer-compression:
-    name: Run integer compression benchmarks (8xH100-SXM5)
+  run-benchmarks-8-h100:
+    name: Run integer benchmarks (8xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100x8-SXM5
-      command: integer_compression
+      profile: multi-h100
+      hardware_name: n3-H100x8
+      command: integer_multi_bit
      op_flavor: default
-      bench_type: both
+      bench_type: latency
      all_precisions: true
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -52,37 +73,16 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-8-h100-sxm5-integer-zk:
-    name: Run integer zk benchmarks (8xH100-SXM5)
+  run-benchmarks-l40:
+    name: Run integer benchmarks (L40)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100x8-SXM5
-      command: integer_zk
+      profile: l40
+      hardware_name: n3-L40x1
+      command: integer_multi_bit,integer_compression,pbs,ks
      op_flavor: default
-      bench_type: both
-      all_precisions: true
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-8-h100-sxm5-noise-squash:
-    name: Run integer zk benchmarks (8xH100-SXM5)
-    if: github.repository == 'zama-ai/tfhe-rs'
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: multi-h100-sxm5
-      hardware_name: n3-H100x8-SXM5
-      command: hlapi_noise_squash
-      op_flavor: default
-      bench_type: both
+      bench_type: latency
      all_precisions: true
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
--- a/.github/workflows/benchmark_hpu_integer.yml
+++ b/.github/workflows/benchmark_hpu_integer.yml
@@ -33,7 +33,6 @@ jobs:
        with:
          fetch-depth: 0
          persist-credentials: 'false'
-          lfs: true
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Get benchmark details
@@ -62,7 +61,6 @@ jobs:

      - name: Run benchmarks
        run: |
-          make pull_hpu_files
          make bench_integer_hpu
          make bench_hlapi_erc20_hpu

--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -48,7 +48,7 @@ jobs:
    name: Execute FFT benchmarks in EC2
    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      group: ${{ github.workflow_ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
    steps:
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -48,7 +48,7 @@ jobs:
    name: Execute NTT benchmarks in EC2
    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+      group: ${{ github.workflow_ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
    steps:
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -49,14 +49,6 @@ jobs:
          mv linelint-linux-amd64 /usr/local/bin/linelint
          make check_newline

-      # This is needed for the ws tests clippy checks
-      - name: Use specific data branch
-        if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
-        env:
-          PR_BRANCH: ${{ github.head_ref || github.ref_name }}
-        run: |
-          echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
-
      - name: Run pcc checks
        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -13,7 +13,7 @@ env:
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

 permissions:
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -13,7 +13,7 @@ env:
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

 permissions:
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -3,15 +3,14 @@ name: Check commit and PR compliance
 on:
  pull_request:

-permissions: {}
+permissions:
+  contents: read
+  pull-requests: read # Permission needed to scan commits in a pull-request

 jobs:
  check-commit-pr:
    name: Check commit and PR
    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: write # Permission needed to scan commits in a pull-request and write issue comment
    steps:
      - name: Check first line
        uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
--- a/.github/workflows/data_pr_close.yml
+++ b/.github/workflows/data_pr_close.yml
@@ -0,0 +1,63 @@
+name: Close or Merge corresponding PR on the data repo
+
+# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
+
+env:
+  DATA_REPO: zama-ai/tfhe-backward-compat-data
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  PR_BRANCH: ${{ github.head_ref || github.ref_name }}
+  CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}
+
+# only trigger on pull request closed events
+on:
+  pull_request:
+    types: [ closed ]
+
+permissions: {}
+
+jobs:
+  auto_close_job:
+    if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') && github.repository == 'zama-ai/tfhe-rs' }}
+    runs-on: ubuntu-latest
+    env:
+      GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}  # Needed for gh CLI commands
+    steps:
+    - name: Fetch PR number
+      run: |
+        PR_NUMBER=$(gh pr view "${PR_BRANCH}" --repo "${DATA_REPO}" --json number | jq '.number')
+        echo "DATA_REPO_PR_NUMBER=${PR_NUMBER}" >> "${GITHUB_ENV}"
+
+    - name: Comment on the PR to indicate the reason of the close
+      run: |
+        gh pr comment "${PR_BRANCH}" \
+        --repo "${DATA_REPO}" \
+        --body "PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}"
+      env:
+        REPO: ${{ github.repository }}
+        EVENT_NUMBER: ${{ github.event.number }}
+
+    - name: Merge the Pull Request in the data repo
+      if: ${{ github.event.pull_request.merged }}
+      run: |
+        gh pr merge "${PR_BRANCH}" \
+        --repo "${DATA_REPO}" \
+        --rebase \
+        --delete-branch
+
+    - name: Close the Pull Request in the data repo
+      if: ${{ !github.event.pull_request.merged }}
+      run: |
+        gh pr close "${PR_BRANCH}" \
+        --repo "${DATA_REPO}" \
+        --delete-branch
+
+    - name: Slack Notification
+      if: ${{ always() && job.status == 'failure' }}
+      continue-on-error: true
+      uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+      env:
+        SLACK_COLOR: ${{ job.status }}
+        SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: https://github.com/${{ env.DATA_REPO }}/pull/${{ env.DATA_REPO_PR_NUMBER }}"
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -137,7 +137,7 @@ jobs:
      # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
      - name: Run multi-bit CUDA integer tests
        run: |
-          BIG_TESTS_INSTANCE=TRUE NO_BIG_PARAMS_GPU=TRUE make test_integer_multi_bit_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci

      - name: Run user docs tests
        run: |
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -11,7 +11,6 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  IS_PR: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -19,8 +18,6 @@ on:
  schedule:
    # Nightly tests will be triggered each evening 8p.m.
    - cron: "0 20 * * *"
-  pull_request:
-

 permissions:
  contents: read
@@ -81,11 +78,7 @@ jobs:

      - name: Run tests
        run: |
-          if [[ "${IS_PR}" == "true" ]]; then
-            make test_integer_short_run_gpu
-          else
-            make test_integer_long_run_gpu
-          fi
+          make test_integer_long_run_gpu

  slack-notify:
    name: Slack Notification
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -25,6 +25,9 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+  schedule:
+    # Nightly tests @ 1AM after each work day
+    - cron: "0 1 * * MON-FRI"

 permissions:
  contents: read
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -25,6 +25,9 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+  schedule:
+    # Nightly tests @ 1AM after each work day
+    - cron: "0 1 * * MON-FRI"

 permissions:
  contents: read
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -13,7 +13,7 @@ env:
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true


--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -93,9 +93,6 @@ jobs:
            echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"
-        env:
-          GCC_VERSION: ${{ matrix.gcc }}
-
      - name: Prepare package
        run: |
          cargo package -p tfhe-cuda-backend
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -21,7 +21,7 @@ jobs:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: git-sync
-        uses: valtech-sd/git-sync@e734cfe9485a92e720eac5af8a4555dde5fecf88
+        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
        with:
          source_repo: "zama-ai/tfhe-rs"
          source_branch: "main"
--- a/.github/workflows/unverified_prs.yml
+++ b/.github/workflows/unverified_prs.yml
@@ -1,26 +0,0 @@
-name: 'Close unverified PRs'
-on:
-  schedule:
-    - cron: '30 1 * * *'
-
-permissions: {}
-
-jobs:
-  stale:
-    runs-on: ubuntu-latest
-    permissions:
-      issues: read
-      pull-requests: write
-    steps:
-      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
-        with:
-          stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
-          days-before-stale: 2
-          days-before-close: 0
-          # We are not interested in suppressing issues so have a currently non existent label
-          # if we ever accept issues to become stale/closable this label will be the signal for that
-          only-issue-labels: can-be-auto-closed
-          # Only unverified PRs are an issue
-          exempt-pr-labels: cla-signed
-          # We don't want people commenting to keep an unverified PR
-          ignore-updates: true
--- a/.linelint.yml
+++ b/.linelint.yml
@@ -10,7 +10,6 @@ ignore:
  - keys
  - coverage
  - utils/tfhe-lints/ui/main.stderr
-  - utils/tfhe-backward-compat-data/**/*.ron # ron files are autogenerated

 rules:
  # checks if file ends in a newline character
--- a/14
+++ b/14
@@ -1,28 +1,18 @@
 # Specifying a path without code owners means that path won't have owners and is akin to a negation
 # i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
 # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file
-
-/backends/tfhe-cuda-backend/            @agnesLeroy
-/backends/tfhe-hpu-backend/             @zama-ai/hardware
-
-/tfhe/examples/hpu                      @zama-ai/hardware
-
 /tfhe/src/core_crypto/                  @IceTDrinker
-/tfhe/src/core_crypto/gpu               @agnesLeroy
-/tfhe/src/core_crypto/hpu               @zama-ai/hardware
+/tfhe/src/core_crypto/gpu

 /tfhe/src/shortint/                     @mayeul-zama

 /tfhe/src/integer/                      @tmontaigu
-/tfhe/src/integer/gpu                   @agnesLeroy
-/tfhe/src/integer/hpu                   @zama-ai/hardware
+/tfhe/src/integer/gpu

 /tfhe/src/high_level_api/               @tmontaigu

 /Makefile                               @IceTDrinker @soonum

-/mockups/tfhe-hpu-mockup                @zama-ai/hardware
-
 /.github/                               @soonum

 /CODEOWNERS                             @IceTDrinker
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -170,8 +170,6 @@ On the contrary, these changes are *not* data breaking:
 * Renaming a type (unless it implements the `Named` trait).
 * Adding a variant to the end of an enum.

-Historical data from previous TFHE-rs versions are stored inside `utils/tfhe-backward-compat-data`. They are used to check on every PR that backward compatibility has been preserved.
-
 ## Example: adding a field

 Suppose you want to add an i32 field to a type named `MyType`. The original type is defined as:
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ members = [
 ]

 exclude = [
-    "utils/tfhe-backward-compat-data",
+    "tests/backward_compatibility_tests",
    "utils/tfhe-lints",
    "apps/trivium",
 ]
--- a/102
+++ b/102
@@ -22,7 +22,10 @@ BENCH_TYPE?=latency
 BENCH_PARAM_TYPE?=classical
 BENCH_PARAMS_SET?=default
 NODE_VERSION=22.6
-BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
+BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
+BACKWARD_COMPAT_DATA_BRANCH?=$(shell ./scripts/backward_compat_data_version.py)
+BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
+BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
 TFHE_SPEC:=tfhe
 WASM_PACK_VERSION="0.13.1"
 # We are kind of hacking the cut here, the version cannot contain a quote '"'
@@ -156,23 +159,23 @@ install_tarpaulin: install_rs_build_toolchain

 .PHONY: install_cargo_dylint # Install custom tfhe-rs lints
 install_cargo_dylint:
-	cargo install --locked cargo-dylint dylint-link
+	cargo install cargo-dylint dylint-link

 .PHONY: install_typos_checker # Install typos checker
 install_typos_checker: install_rs_build_toolchain
 	@typos --version > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked typos-cli || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
 	( echo "Unable to install typos-cli, unknown error." && exit 1 )

 .PHONY: install_zizmor # Install zizmor workflow security checker
 install_zizmor: install_rs_build_toolchain
 	@zizmor --version > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked zizmor --version ~1.9 || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
 	( echo "Unable to install zizmor, unknown error." && exit 1 )

-.PHONY: install_cargo_cross # Install cross for big endian tests
+.PHONY: install_cargo_cross # Install custom tfhe-rs lints
 install_cargo_cross: install_rs_build_toolchain
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked cross
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross

 .PHONY: setup_venv # Setup Python virtualenv for wasm tests
 setup_venv:
@@ -249,9 +252,6 @@ install_mlc: install_rs_build_toolchain
 .PHONY: fmt # Format rust code
 fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt

 .PHONY: fmt_js # Format javascript code
 fmt_js: check_nvm_installed
@@ -273,9 +273,6 @@ fmt_c_tests:
 .PHONY: check_fmt # Check rust code format
 check_fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
-	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check

 .PHONY: check_fmt_c_tests  # Check C tests format
 check_fmt_c_tests:
@@ -486,22 +483,10 @@ clippy_param_dedup: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p param_dedup -- --no-deps -D warnings

-.PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
-clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
-	@# Some old crates are x86 specific, only run in that case
-	@if uname -a | grep -q x86; then \
-		RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-			-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all-targets \
-			-- --no-deps -D warnings; \
-	else \
-		echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
-	fi
-
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
 clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
-clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
-clippy_backward_compat_data
+clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -517,12 +502,6 @@ clippy_hpu_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-hpu-backend -- --no-deps -D warnings

-.PHONY: clippy_hpu_mockup # Run clippy lints on tfhe-hpu-mockup
-clippy_hpu_mockup: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--all-targets \
-		-p tfhe-hpu-mockup -- --no-deps -D warnings
-
 .PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
 check_rust_bindings_did_not_change:
 	cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
@@ -535,9 +514,6 @@ check_rust_bindings_did_not_change:
 tfhe_lints: install_cargo_dylint
 	RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe --no-deps -- \
 		--features=boolean,shortint,integer,strings,zk-pok
-	RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe-zk-pok --no-deps -- \
-		--features=experimental
-

 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
@@ -679,14 +655,6 @@ test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

-.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
-test_integer_gpu_debug: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release_lto_off \
-		--features=integer,gpu-debug -vv -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=1 --nocapture
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
-		--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::
-
-
 .PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
 test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -695,12 +663,6 @@ test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
 		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
 		--tfhe-package "$(TFHE_SPEC)" --backend "gpu"

-.PHONY: test_integer_short_run_gpu # Run the long run integer tests on the gpu backend
-test_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
-	TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE \
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence integer::gpu::server_key::radix::tests_long_run::test_signed_random_op_sequence --test-threads=1 --nocapture
-
 .PHONY: test_integer_compression
 test_integer_compression: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -1063,11 +1025,16 @@ test_tfhe_lints: install_cargo_dylint
 # Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
 .PHONY: test_backward_compatibility_ci
 test_backward_compatibility_ci: install_rs_build_toolchain
-	TFHE_BACKWARD_COMPAT_DATA_DIR="../$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tests/$(BACKWARD_COMPAT_DATA_DIR)\"" \
 		--features=shortint,integer,zk-pok -p tests test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
-test_backward_compatibility: pull_backward_compat_data test_backward_compatibility_ci
+test_backward_compatibility: tests/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
+
+.PHONY: backward_compat_branch # Prints the required backward compatibility branch
+backward_compat_branch:
+	@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"

 .PHONY: doc # Build rust doc
 doc: install_rs_check_toolchain
@@ -1112,10 +1079,6 @@ check_intra_md_links: install_mlc
 check_md_links: install_mlc
 	mlc --match-file-extension tfhe/docs

-.PHONY: check_doc_paths_use_dash # Check paths use "-" instead of "_" in docs for gitbook compatibility
-check_doc_paths_use_dash:
-	python3 ./scripts/check_doc_paths_use_dash.py
-
 .PHONY: check_parameter_export_ok # Checks exported "current" shortint parameter module is correct
 check_parameter_export_ok:
 	python3 ./scripts/check_current_param_export.py
@@ -1473,20 +1436,6 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

-.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
-bench_hlapi_noise_squash: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-noise-squash \
-	--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
-
-.PHONY: bench_hlapi_noise_squash_gpu # Run benchmarks for noise squash operation on GPU
-bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench hlapi-noise-squash \
-	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
-
 #
 # Utility tools
 #
@@ -1544,13 +1493,11 @@ write_params_to_file: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
 	--example write_params_to_file --features=boolean,shortint,hpu,internal-keycache

-.PHONY: pull_backward_compat_data # Pull the data files needed for backward compatibility tests
-pull_backward_compat_data:
-	./scripts/pull_lfs_data.sh $(BACKWARD_COMPAT_DATA_DIR)
+.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
+clone_backward_compat_data:
+	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tests/$(BACKWARD_COMPAT_DATA_DIR)

-.PHONY: pull_hpu_files # Pull the hpu files
-pull_hpu_files:
-	./scripts/pull_lfs_data.sh backends/tfhe-hpu-backend/
+tests/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data

 #
 # Real use case examples
@@ -1576,8 +1523,7 @@ sha256_bool: install_rs_check_toolchain

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
 pcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
-check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash \
-clippy_all check_compile_tests test_tfhe_lints \
+check_md_docs_are_tested check_intra_md_links clippy_all check_compile_tests test_tfhe_lints \
 tfhe_lints

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
@@ -1585,11 +1531,11 @@ pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
 clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu

 .PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
-pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast
+pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast

 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
 fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
-check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash clippy_fast check_compile_tests
+check_md_docs_are_tested clippy_fast check_compile_tests

 .PHONY: conformance # Automatically fix problems that can be fixed
 conformance: fix_newline fmt fmt_js
--- a/README.md
+++ b/README.md
@@ -18,7 +18,6 @@
  <a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
  <a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
  <a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
-  <a href="https://slsa.dev"><img alt="SLSA 3" src="https://slsa.dev/images/gh-badge-level3.svg" /></a>
 </p>

 ## About
@@ -149,7 +148,7 @@ To run this code, use the following command:
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performances possible.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*
+*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*

 <p align="right">
  <a href="#about" > ↑ Back to top </a>
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.

 # FHE shortint Trivium implementation

-The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
+The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
 It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
 on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes
 its setup a little more intricate.
@@ -137,10 +137,10 @@ its setup a little more intricate.
 Example code:
 ```rust
 use tfhe::shortint::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{ConfigBuilder, generate_keys, FheUint64};
 use tfhe::prelude::*;
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

-    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {

 pub fn kreyvium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {

 pub fn kreyvium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};

 pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {

 pub fn trivium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {

 pub fn trivium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,9 +1,9 @@
 use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo renaud1239/Kreyvium,
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn kreyvium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,9 +1,9 @@
 use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn trivium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.11.0"
+version = "0.10.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -19,4 +19,3 @@ bindgen = "0.71"
 [features]
 experimental-multi-arch = []
 profile = []
-debug = []
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -53,10 +53,6 @@ fn main() {
            cmake_config.define("USE_NVTOOLS", "OFF");
        }

-        if cfg!(feature = "debug") {
-            cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
-        }
-
        // Build the CMake project
        let dest = cmake_config.build();
        println!("cargo:rustc-link-search=native={}", dest.display());
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -52,8 +52,6 @@ endif()

 if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
-else()
-  message("Building CUDA backend in ${CMAKE_BUILD_TYPE}")
 endif()

 # Add OpenMP support
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -37,6 +37,17 @@ void cuda_glwe_sample_extract_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
    uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
+void cuda_modulus_switch_multi_bit_64(void *stream, uint32_t gpu_index,
+                                      void *lwe_array_out, void *lwe_array_in,
+                                      uint32_t size, uint32_t log_modulus,
+                                      uint32_t degree,
+                                      uint32_t grouping_factor);
+
+void cuda_modulus_switch_multi_bit_128(void *stream, uint32_t gpu_index,
+                                       void *lwe_array_out, void *lwe_array_in,
+                                       uint32_t size, uint32_t log_modulus,
+                                       uint32_t degree,
+                                       uint32_t grouping_factor);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -49,13 +49,12 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index);

 void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
+                                           uint64_t *size_tracker,
                                           bool allocate_gpu_memory);

 void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

 bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
-uint64_t cuda_device_total_memory(uint32_t gpu_index);

 void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
                                                 uint64_t size,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -40,6 +40,11 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(
 void cleanup_cuda_integer_decompress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);
-}

+void cuda_integer_extract_glwe_64(
+    void *const *streams, uint32_t const *gpu_indexes, void *lwe_array_out,
+    void const *glwe_list, uint32_t const glwe_index,
+    uint32_t const log_modulus, uint32_t const polynomial_size,
+    uint32_t const glwe_dimension, uint32_t const body_count);
+}
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -20,7 +20,7 @@ template <typename Torus> struct int_compression {
                  uint32_t gpu_count, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  uint32_t storage_log_modulus, bool allocate_gpu_memory,
-                  uint64_t &size_tracker) {
+                  uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
    this->lwe_per_glwe = lwe_per_glwe;
@@ -38,7 +38,7 @@ template <typename Torus> struct int_compression {
        lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
        gpu_indexes[0], size_tracker, allocate_gpu_memory);

-    size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
+    *size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
        streams[0], gpu_indexes[0], &fp_ks_buffer,
        compression_params.small_lwe_dimension,
        compression_params.glwe_dimension, compression_params.polynomial_size,
@@ -76,7 +76,7 @@ template <typename Torus> struct int_decompression {
                    int_radix_params compression_params,
                    uint32_t num_radix_blocks, uint32_t body_count,
                    uint32_t storage_log_modulus, bool allocate_gpu_memory,
-                    uint64_t &size_tracker) {
+                    uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->encryption_params = encryption_params;
    this->compression_params = compression_params;
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -48,34 +48,6 @@ typedef struct {
  uint32_t lwe_dimension;
 } CudaRadixCiphertextFFI;

-typedef struct {
-  uint64_t const *chosen_multiplier_has_at_least_one_set;
-  uint64_t const *decomposed_chosen_multiplier;
-
-  uint32_t const num_scalars;
-  uint32_t const active_bits;
-  uint64_t const shift_pre;
-  uint32_t const shift_post;
-  uint32_t const ilog2_chosen_multiplier;
-  uint32_t const chosen_multiplier_num_bits;
-
-  bool const is_chosen_multiplier_zero;
-  bool const is_abs_chosen_multiplier_one;
-  bool const is_chosen_multiplier_negative;
-  bool const is_chosen_multiplier_pow2;
-  bool const chosen_multiplier_has_more_bits_than_numerator;
-  // if signed: test if chosen_multiplier >= 2^{num_bits - 1}
-  bool const is_chosen_multiplier_geq_two_pow_numerator;
-
-  uint32_t const ilog2_divisor;
-
-  bool const is_divisor_zero;
-  bool const is_abs_divisor_one;
-  bool const is_divisor_negative;
-  bool const is_divisor_pow2;
-  bool const divisor_has_more_bits_than_numerator;
-} CudaScalarDivisorFFI;
-
 uint64_t scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -423,8 +395,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
-    bool allocate_ms_array);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -443,8 +414,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
-    bool allocate_ms_array);
+    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -577,6 +547,27 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
                              void *const *streams,
                              uint32_t const *gpu_indexes);

+uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool anticipated_buffer_drop, bool allocate_gpu_memory,
+    bool allocate_ms_array);
+
+void cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
+    uint64_t rhs, uint64_t const *decomposed_scalar,
+    uint64_t const *has_at_least_one_set,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_scalars);
+
+void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
 uint64_t scratch_cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -599,6 +590,9 @@ void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void);
+void cuda_small_scalar_multiplication_integer_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *lwe_array, uint64_t scalar);

 uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -621,115 +615,5 @@ void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
                                                 uint32_t const *gpu_indexes,
                                                 uint32_t gpu_count,
                                                 int8_t **mem_ptr_void);
-
-uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, bool allocate_ms_array);
-
-void cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi);
-
-void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t num_additional_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, bool allocate_ms_array);
-
-void cuda_extend_radix_with_sign_msb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
-    int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
-    void *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
-
-void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
-                                                uint32_t const *gpu_indexes,
-                                                uint32_t gpu_count,
-                                                int8_t **mem_ptr_void);
-
-uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, bool allocate_ms_array);
-
-void cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);
-
-void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
-    bool allocate_ms_array);
-
-void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    void const *clear_blocks, void const *h_clear_blocks,
-    uint32_t num_clear_blocks);
-
-void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
-    bool allocate_ms_array);
-
-void cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    uint32_t numerator_bits);
-
-void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
 } // extern C
 #endif // CUDA_INTEGER_H
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -65,6 +65,9 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
    void const *lwe_array_in, const uint64_t plaintext_in,
    const uint32_t input_lwe_dimension,
    const uint32_t input_lwe_ciphertext_count);
+void cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void const *plaintext_array_in, const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
 }
-
 #endif // CUDA_LINALG_H_
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -66,9 +66,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    uint32_t num_many_lut, uint32_t lut_stride);

 template <typename Torus>
-uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
-    uint32_t polynomial_size);
-template <typename Torus>
 uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
    uint32_t polynomial_size);
 template <typename Torus>
@@ -98,12 +95,8 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(

 template <typename Torus, class params>
 uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                            uint32_t polynomial_size,
-                            uint64_t full_sm_keybundle);
-template <typename Torus, class params>
-uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
-                                uint32_t polynomial_size,
-                                uint64_t full_sm_keybundle);
+                            uint32_t polynomial_size);
+
 template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
@@ -122,7 +115,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
-             uint64_t &size_tracker) {
+             uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);

@@ -288,146 +281,4 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  }
 };

-template <typename InputTorus>
-struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
-  int8_t *d_mem_keybundle = NULL;
-  int8_t *d_mem_acc_step_one = NULL;
-  int8_t *d_mem_acc_step_two = NULL;
-  int8_t *d_mem_acc_cg = NULL;
-  int8_t *d_mem_acc_tbc = NULL;
-  uint32_t lwe_chunk_size;
-  double *keybundle_fft;
-  __uint128_t *global_accumulator;
-  double *global_join_buffer;
-
-  PBS_VARIANT pbs_variant;
-  bool gpu_memory_allocated;
-
-  pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
-                 uint32_t glwe_dimension, uint32_t polynomial_size,
-                 uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-                 uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
-                 bool allocate_gpu_memory, uint64_t &size_tracker) {
-    gpu_memory_allocated = allocate_gpu_memory;
-    cuda_set_device(gpu_index);
-
-    this->pbs_variant = pbs_variant;
-    this->lwe_chunk_size = lwe_chunk_size;
-    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-
-    // default
-    uint64_t full_sm_keybundle =
-        get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
-            __uint128_t>(polynomial_size);
-    uint64_t full_sm_accumulate_step_one =
-        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<
-            __uint128_t>(polynomial_size);
-    uint64_t full_sm_accumulate_step_two =
-        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<
-            __uint128_t>(polynomial_size);
-    uint64_t partial_sm_accumulate_step_one =
-        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
-            __uint128_t>(polynomial_size);
-    // cg
-    uint64_t full_sm_cg_accumulate =
-        get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>(
-            polynomial_size);
-    uint64_t partial_sm_cg_accumulate =
-        get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
-            __uint128_t>(polynomial_size);
-
-    auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
-                                (glwe_dimension + 1) * (glwe_dimension + 1) *
-                                level_count;
-    auto num_blocks_acc_step_one =
-        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
-    auto num_blocks_acc_step_two =
-        input_lwe_ciphertext_count * (glwe_dimension + 1);
-    auto num_blocks_acc_cg =
-        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
-
-    // Keybundle
-    if (max_shared_memory < full_sm_keybundle)
-      d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
-          num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
-          size_tracker, allocate_gpu_memory);
-
-    switch (pbs_variant) {
-    case PBS_VARIANT::CG:
-      // Accumulator CG
-      if (max_shared_memory < partial_sm_cg_accumulate)
-        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
-            size_tracker, allocate_gpu_memory);
-      else if (max_shared_memory < full_sm_cg_accumulate)
-        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
-            size_tracker, allocate_gpu_memory);
-      break;
-    case PBS_VARIANT::DEFAULT:
-      // Accumulator step one
-      if (max_shared_memory < partial_sm_accumulate_step_one)
-        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
-      else if (max_shared_memory < full_sm_accumulate_step_one)
-        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
-
-      // Accumulator step two
-      if (max_shared_memory < full_sm_accumulate_step_two)
-        d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
-            num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
-            gpu_index, size_tracker, allocate_gpu_memory);
-      break;
-    default:
-      PANIC("Cuda error (PBS): unsupported implementation variant.")
-    }
-
-    keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
-        num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
-        stream, gpu_index, size_tracker, allocate_gpu_memory);
-    global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
-        input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
-            sizeof(__uint128_t),
-        stream, gpu_index, size_tracker, allocate_gpu_memory);
-    global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
-        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
-            (polynomial_size / 2) * 4 * sizeof(double),
-        stream, gpu_index, size_tracker, allocate_gpu_memory);
-  }
-
-  void release(cudaStream_t stream, uint32_t gpu_index) {
-
-    if (d_mem_keybundle)
-      cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
-                                         gpu_memory_allocated);
-    switch (pbs_variant) {
-    case DEFAULT:
-      if (d_mem_acc_step_one)
-        cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
-                                           gpu_index, gpu_memory_allocated);
-      if (d_mem_acc_step_two)
-        cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
-                                           gpu_index, gpu_memory_allocated);
-      break;
-    case CG:
-      if (d_mem_acc_cg)
-        cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
-                                           gpu_memory_allocated);
-      break;
-    default:
-      PANIC("Cuda error (PBS): unsupported implementation variant.")
-    }
-
-    cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
-                                       gpu_memory_allocated);
-    cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
-                                       gpu_memory_allocated);
-    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
-                                       gpu_memory_allocated);
-  }
-};
-
 #endif // CUDA_MULTI_BIT_UTILITIES_H
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -90,7 +90,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
             uint32_t glwe_dimension, uint32_t polynomial_size,
             uint32_t level_count, uint32_t input_lwe_ciphertext_count,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
-             bool allocate_ms_array, uint64_t &size_tracker) {
+             bool allocate_ms_array, uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);
    this->uses_noise_reduction = allocate_ms_array;
@@ -240,10 +240,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  }
 };

-template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;
-
-template <typename InputTorus>
-struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
+template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
  int8_t *d_mem;

  __uint128_t *global_accumulator;
@@ -260,7 +257,7 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
                 uint32_t polynomial_size, uint32_t level_count,
                 uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
                 bool allocate_gpu_memory, bool allocate_ms_array,
-                 uint64_t &size_tracker) {
+                 uint64_t *size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);
    this->pbs_variant = pbs_variant;
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -15,11 +15,6 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

-void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
-    void *stream, uint32_t gpu_index, void *dest, void const *src,
-    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size, uint32_t grouping_factor);
-
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -38,25 +33,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);
-
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
-
-void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride);
-
-void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
-                                                       const uint32_t gpu_index,
-                                                       int8_t **buffer);
 }

 #endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -10,8 +10,8 @@ extern "C" {
 void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
                        void *lwe_array_out, const void *lwe_compact_array_in,
                        uint32_t lwe_dimension, uint32_t num_lwe,
-                        const uint32_t *lwe_compact_input_indexes,
-                        const uint32_t *output_body_id_per_compact_list);
+                        const void *lwe_compact_input_indexes,
+                        const void *output_body_id_per_compact_list);

 uint64_t scratch_cuda_expand_without_verification_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -27,7 +27,7 @@ template <typename Torus> struct zk_expand_mem {
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
                const bool *is_boolean_array, uint32_t num_compact_lists,
-                bool allocate_gpu_memory, uint64_t &size_tracker)
+                bool allocate_gpu_memory, uint64_t *size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
        casting_key_type(casting_key_type) {
--- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,6 +1,5 @@
 file(GLOB_RECURSE SOURCES "*.cu")
-add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
-                                     pbs/programmable_bootstrap_multibit_128.cu)
+add_library(tfhe_cuda_backend STATIC ${SOURCES})
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
 target_include_directories(tfhe_cuda_backend PRIVATE .)
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -1,5 +1,6 @@
 #include "ciphertext.cuh"
 #include "polynomial/parameters.cuh"
+#include "torus.cuh"

 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
@@ -142,3 +143,29 @@ void cuda_glwe_sample_extract_128(
          "N's are powers of two in the interval [256..4096].")
  }
 }
+
+void cuda_modulus_switch_multi_bit_64(void *stream, uint32_t gpu_index,
+                                      void *lwe_array_out, void *lwe_array_in,
+                                      uint32_t size, uint32_t log_modulus,
+                                      uint32_t degree,
+                                      uint32_t grouping_factor) {
+
+  host_modulus_switch_multi_bit<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
+
+void cuda_modulus_switch_multi_bit_128(void *stream, uint32_t gpu_index,
+                                       void *lwe_array_out, void *lwe_array_in,
+                                       uint32_t size, uint32_t log_modulus,
+                                       uint32_t degree,
+                                       uint32_t grouping_factor) {
+
+  host_modulus_switch_multi_bit<__uint128_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<__uint128_t *>(lwe_array_out),
+      static_cast<__uint128_t *>(lwe_array_in), size, log_modulus, degree,
+      grouping_factor);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -38,16 +38,6 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
 // Each thread in x are used to calculate one output.
 // threads in y are used to paralelize the lwe_dimension_in loop.
 // shared memory is used to store intermediate results of the reduction.
-// Note: To reduce register pressure we have slightly changed the algorithm,
-// the idea consists in calculating the negate value of the output. So, instead
-// of accumulating subtractions using -=, we accumulate additions using += in
-// the local_lwe_out. This seems to work better cause profits madd ops and save
-// some regs. For this to work, we need to negate the input
-// lwe_array_in[lwe_dimension_in], and negate back the output at the end to get
-// the correct results. Additionally, we split the calculation of the ksk offset
-// in two parts, a constant part is calculated before the loop, and a variable
-// part is calculated inside the loop. This seems to help with the register
-// pressure as well.
 template <typename Torus>
 __global__ void
 keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
@@ -70,7 +60,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
        lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);

    if (tid == lwe_dimension_out && threadIdx.y == 0) {
-      local_lwe_out = -block_lwe_array_in[lwe_dimension_in];
+      local_lwe_out = block_lwe_array_in[lwe_dimension_in];
    }
    const Torus mask_mod_b = (1ll << base_log) - 1ll;

@@ -83,12 +73,12 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
    for (int i = start_i; i < end_i; i++) {
      Torus state =
          init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
-      uint32_t offset = i * level_count * (lwe_dimension_out + 1);
-      for (int j = 0; j < level_count; j++) {

+      for (int j = 0; j < level_count; j++) {
+        auto ksk_block =
+            get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
        Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
-        local_lwe_out +=
-            (Torus)ksk[tid + j * (lwe_dimension_out + 1) + offset] * decomposed;
+        local_lwe_out -= (Torus)ksk_block[tid] * decomposed;
      }
    }

@@ -103,7 +93,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
          lwe_acc_out[shmem_index + offset * blockDim.x];
    }
    if (threadIdx.y == 0)
-      block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
+      block_lwe_array_out[tid] = lwe_acc_out[shmem_index];
  }
 }

@@ -186,10 +176,10 @@ __host__ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe(
                             ? glwe_accumulator_size
                             : lwe_dimension * 2;

-  uint64_t size_tracker = 0;
+  uint64_t size_tracker;
  uint64_t buffer_size = 2 * num_lwes * memory_unit * sizeof(Torus);
  *fp_ks_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
-      buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
+      buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -66,13 +66,6 @@ __device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
  r = __ll2double_rn(x);
 }

-template <>
-__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
-                                                             double &r) {
-  // We truncate x
-  r = __ll2double_rn(static_cast<uint64_t>(x));
-}
-
 template <typename T>
 __device__ inline T init_decomposer_state(T input, uint32_t base_log,
                                          uint32_t level_count) {
@@ -302,4 +295,87 @@ __host__ void host_improve_noise_modulus_switch(
  check_cuda_error(cudaGetLastError());
 }

+template <typename Torus, class params>
+__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
+                                               uint32_t ggsw_idx,
+                                               uint32_t grouping_factor) {
+  Torus x = 0;
+  for (int i = 0; i < grouping_factor; i++) {
+    uint32_t mask_position = grouping_factor - (i + 1);
+    int selection_bit = (ggsw_idx >> mask_position) & 1;
+    x += selection_bit * lwe_array_group[i];
+  }
+
+  return modulus_switch(x, params::log2_degree + 1);
+}
+
+template <typename Torus, class params>
+__global__ void
+modulus_switch_multi_bit(Torus *array_out, const Torus *array_in, int size,
+                         uint32_t log_modulus, uint32_t grouping_factor) {
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < size) {
+    int num_monomials = 1 << grouping_factor;
+    int input_offset = tid * grouping_factor;
+    int output_offset = tid * (num_monomials - 1); // First monomial is skipped
+    for (int ggsw_idx = 1; ggsw_idx < num_monomials; ggsw_idx++) {
+      array_out[ggsw_idx - 1 + output_offset] =
+          calculates_monomial_degree<Torus, params>(&array_in[input_offset],
+                                                    ggsw_idx, grouping_factor);
+    }
+  }
+}
+
+template <typename Torus>
+__host__ void host_modulus_switch_multi_bit(
+    cudaStream_t stream, uint32_t gpu_index, Torus *array_out, Torus *array_in,
+    int size, uint32_t log_modulus, uint32_t degree, uint32_t grouping_factor) {
+  cudaSetDevice(gpu_index);
+  int multibit_size = size / grouping_factor;
+  int num_threads = 0, num_blocks = 0;
+  getNumBlocksAndThreads(multibit_size, 1024, num_blocks, num_threads);
+  switch (degree) {
+  case 256:
+    modulus_switch_multi_bit<Torus, Degree<256>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 512:
+    modulus_switch_multi_bit<Torus, Degree<512>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 1024:
+    modulus_switch_multi_bit<Torus, Degree<1024>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 2048:
+    modulus_switch_multi_bit<Torus, Degree<2048>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 4096:
+    modulus_switch_multi_bit<Torus, Degree<4096>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 8192:
+    modulus_switch_multi_bit<Torus, Degree<8192>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  case 16384:
+    modulus_switch_multi_bit<Torus, Degree<16384>>
+        <<<num_blocks, num_threads, 0, stream>>>(
+            array_out, array_in, multibit_size, log_modulus, grouping_factor);
+    break;
+  default:
+    PANIC("Cuda error: unsupported polynomial size. Supported "
+          "N's are powers of two in the interval [256..16384].")
+  };
+
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -74,9 +74,10 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 /// asynchronously.
 void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
-                                           uint64_t &size_tracker,
+                                           uint64_t *size_tracker,
                                           bool allocate_gpu_memory) {
-  size_tracker += size;
+  if (size_tracker != nullptr)
+    *size_tracker += size;
  void *ptr = nullptr;
  if (!allocate_gpu_memory)
    return ptr;
@@ -105,9 +106,8 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
 /// asynchronously.
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                        uint32_t gpu_index) {
-  uint64_t size_tracker = 0;
-  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
-                                              size_tracker, true);
+  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index, nullptr,
+                                              true);
 }

 /// Check that allocation is valid
@@ -122,13 +122,6 @@ bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
  }
 }

-uint64_t cuda_device_total_memory(uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
-  size_t total_mem = 0, free_mem = 0;
-  check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
-  return total_mem;
-}
-
 /// Returns
 ///  false if Cooperative Groups is not supported.
 ///  true otherwise
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -234,29 +234,6 @@ __device__ void convert_u128_to_f128_as_torus(
  }
 }

-// params is expected to be full degree not half degree
-// same as convert_u128_to_f128_as_torus() but expects input to be on registers
-template <class params>
-__device__ void convert_u128_on_regs_to_f128_as_torus(
-    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
-    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
-
-  const double normalization = pow(2., -128.);
-  Index tid = threadIdx.x;
-  // #pragma unroll
-  for (Index i = 0; i < params::opt / 2; i++) {
-    auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]);
-    auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]);
-
-    out_re_hi[tid] = out_re.hi * normalization;
-    out_re_lo[tid] = out_re.lo * normalization;
-    out_im_hi[tid] = out_im.hi * normalization;
-    out_im_lo[tid] = out_im.lo * normalization;
-
-    tid += params::degree / params::opt;
-  }
-}
-
 template <class params>
 __device__ void
 convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im,
@@ -295,7 +272,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
 }

 // params is expected to be full degree not half degree
-// converts standard input into complex<128> represented by 4 double
+// converts standqard input into complex<128> represented by 4 double
 // with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
 // ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n,  im_lo_0, im_lo_1, ..., im_lo_n]
 template <class params>
@@ -314,7 +291,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
 }

 // params is expected to be full degree not half degree
-// converts standard input into complex<128> represented by 4 double
+// converts standqard input into complex<128> represented by 4 double
 // with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1,
 // re_lo_1, im_hi_1, im_lo_1,
 // ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ]
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -26,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
  if (is_signed) {
    *mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count,
                                         params, num_blocks,
-                                         allocate_gpu_memory, size_tracker);
+                                         allocate_gpu_memory, &size_tracker);
  }
  return size_tracker;
 }
@@ -53,8 +53,7 @@ __host__ void host_integer_abs_kb(
      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
-                       ct->num_radix_blocks, mem_ptr->params.message_modulus,
-                       mem_ptr->params.carry_modulus);
+                       ct->num_radix_blocks);

  uint32_t requested_flag = outputFlag::FLAG_NONE;
  uint32_t uses_carry = 0;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -61,7 +61,7 @@ __host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op,
                                         params, num_radix_blocks,
-                                         allocate_gpu_memory, size_tracker);
+                                         allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
@@ -15,48 +15,3 @@ void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
  host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
                                       gpu_indexes);
 }
-
-uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t num_additional_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_extend_radix_with_sign_msb<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count,
-      (int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr, params,
-      num_blocks, num_additional_blocks, allocate_gpu_memory);
-}
-
-void cuda_extend_radix_with_sign_msb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
-    int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
-    void *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
-  host_extend_radix_with_sign_msb<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
-      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
-      num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
-}
-
-void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
-                                                uint32_t const *gpu_indexes,
-                                                uint32_t gpu_count,
-                                                int8_t **mem_ptr_void) {
-
-  int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
-      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
@@ -33,62 +33,4 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
      input_start_lwe_index, input->num_radix_blocks);
 }

-template <typename Torus>
-__host__ uint64_t scratch_extend_radix_with_sign_msb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
-    const int_radix_params params, uint32_t num_radix_blocks,
-    uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-
-  *mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      num_additional_blocks, allocate_gpu_memory, size_tracker);
-
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_extend_radix_with_sign_msb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
-    CudaRadixCiphertextFFI const *input,
-    int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
-    uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
-
-  if (num_additional_blocks == 0) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
-                                       input);
-    return;
-  }
-
-  const uint32_t input_blocks = input->num_radix_blocks;
-
-  if (input_blocks == 0) {
-    PANIC("Cuda error: input blocks cannot be zero");
-  }
-
-  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
-                                           0, input_blocks, input, 0,
-                                           input_blocks);
-
-  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                           mem_ptr->last_block, 0, 1, input,
-                                           input_blocks - 1, input_blocks);
-
-  host_apply_univariate_lut_kb(
-      streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
-      mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
-
-  for (uint32_t i = 0; i < num_additional_blocks; ++i) {
-    uint32_t dst_block_idx = input_blocks + i;
-
-    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
-                                             dst_block_idx, dst_block_idx + 1,
-                                             mem_ptr->padding_block, 0, 1);
-  }
-}
-
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -84,8 +84,7 @@ __host__ void host_integer_radix_cmux_kb(
                                   num_radix_blocks, 2 * num_radix_blocks);

  host_addition<Torus>(streams[0], gpu_indexes[0], &mem_true, &mem_true,
-                       &mem_false, num_radix_blocks, params.message_modulus,
-                       params.carry_modulus);
+                       &mem_false, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
@@ -101,7 +100,7 @@ __host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_cmux_buffer<Torus>(
      streams, gpu_indexes, gpu_count, predicate_lut_f, params,
-      num_radix_blocks, allocate_gpu_memory, size_tracker);
+      num_radix_blocks, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -684,7 +684,7 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_comparison_buffer<Torus>(
      streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, is_signed,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu
@@ -89,3 +89,16 @@ void cleanup_cuda_integer_decompress_radix_ciphertext_64(
      (int_decompression<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
+
+void cuda_integer_extract_glwe_64(
+    void *const *streams, uint32_t const *gpu_indexes, void *lwe_array_out,
+    void const *glwe_list, uint32_t const glwe_index,
+    uint32_t const log_modulus, uint32_t const polynomial_size,
+    uint32_t const glwe_dimension, uint32_t const body_count) {
+  auto casted_streams = (cudaStream_t *)(streams);
+
+  host_extract_mem_alloc_free<uint64_t>(
+      casted_streams[0], gpu_indexes[0], static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(glwe_list), glwe_index, log_modulus,
+      polynomial_size, glwe_dimension, body_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -231,6 +231,84 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
  check_cuda_error(cudaGetLastError());
 }

+/// Extracts the glwe_index-nth GLWE ciphertext
+/// This function does not require memory allocation
+template <typename Torus>
+__host__ void host_extract_mem_alloc_free(
+    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_array_out,
+    Torus const *array_in, const uint32_t glwe_index,
+    const uint32_t log_modulus, const uint32_t polynomial_size,
+    const uint32_t glwe_dimension, const uint32_t body_count_in) {
+  if (array_in == glwe_array_out)
+    PANIC("Cuda error: Input and output must be different");
+
+  cuda_set_device(gpu_index);
+
+  auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
+  auto num_glwes = (body_count_in + polynomial_size - 1) / polynomial_size;
+
+  // Compressed length of the compressed GLWE we want to extract
+  uint32_t body_count = 0;
+  if (body_count_in % polynomial_size == 0)
+    body_count = polynomial_size;
+  else if (glwe_index == num_glwes - 1)
+    body_count = body_count_in % polynomial_size;
+  else
+    body_count = polynomial_size;
+
+  auto initial_out_len = glwe_dimension * polynomial_size + body_count;
+  // Calculates how many bits this particular GLWE shall use
+  auto number_bits_to_unpack = initial_out_len * log_modulus;
+  auto nbits = sizeof(Torus) * 8;
+  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+
+  // Calculates how many bits a full-packed GLWE shall use
+  number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
+  auto len = (number_bits_to_unpack + nbits - 1) / nbits;
+  // Uses that length to set the input pointer
+  auto chunk_array_in = array_in + glwe_index * len;
+  // Ensure the tail of the GLWE is zeroed
+  if (initial_out_len < glwe_ciphertext_size) {
+    auto zeroed_slice = glwe_array_out + initial_out_len;
+    cuda_memset_async(glwe_array_out, 0,
+                      (glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
+                      stream, gpu_index);
+  }
+
+  int num_blocks = 0, num_threads = 0;
+  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  dim3 grid(num_blocks);
+  dim3 threads(num_threads);
+  extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, chunk_array_in,
+                                               log_modulus, initial_out_len);
+
+  // uint32_t body_count = std::min(body_count_in, polynomial_size);
+  // auto initial_out_len = glwe_dimension * polynomial_size + body_count;
+
+  // auto compressed_glwe_accumulator_size =
+  //     (glwe_dimension + 1) * polynomial_size;
+
+  // auto number_bits_to_unpack = compressed_glwe_accumulator_size *
+  // log_modulus; auto nbits = sizeof(Torus) * 8;
+  // // number_bits_to_unpack.div_ceil(Scalar::BITS)
+  // auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
+
+  // // We assure the tail of the glwe is zeroed
+  // auto zeroed_slice = glwe_array_out + initial_out_len;
+  // cuda_memset_async(zeroed_slice, 0,
+  //                   (polynomial_size - body_count) * sizeof(Torus),
+  //                   streams[0], gpu_indexes[0]);
+  // int num_blocks = 0, num_threads = 0;
+  // getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
+  // dim3 grid(num_blocks);
+  // dim3 threads(num_threads);
+  // extract<Torus><<<grid, threads, 0, streams[0]>>>(glwe_array_out, array_in,
+  //                                                  glwe_index, log_modulus,
+  //                                                  input_len,
+  //                                                  initial_out_len);
+  check_cuda_error(cudaGetLastError());
+}
+
 template <typename Torus>
 __host__ void host_integer_decompress(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -370,7 +448,7 @@ __host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_compression<Torus>(
      streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
-      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, size_tracker);
+      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -386,7 +464,7 @@ __host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
  *mem_ptr = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
      num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory,
-      size_tracker);
+      &size_tracker);
  return size_tracker;
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -28,7 +28,7 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count,
                                           params, is_signed, num_blocks,
-                                           allocate_gpu_memory, size_tracker);
+                                           allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -268,11 +268,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
    // but in that position, interesting_remainder2 always has a 0
    auto merged_interesting_remainder = interesting_remainder1;

-    host_addition<Torus>(
-        streams[0], gpu_indexes[0], merged_interesting_remainder,
-        merged_interesting_remainder, interesting_remainder2,
-        merged_interesting_remainder->num_radix_blocks,
-        radix_params.message_modulus, radix_params.carry_modulus);
+    host_addition<Torus>(streams[0], gpu_indexes[0],
+                         merged_interesting_remainder,
+                         merged_interesting_remainder, interesting_remainder2,
+                         merged_interesting_remainder->num_radix_blocks);

    // after create_clean_version_of_merged_remainder
    // `merged_interesting_remainder` will be reused as
@@ -383,10 +382,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

-    host_addition<Torus>(
-        streams[0], gpu_indexes[0], overflow_sum, subtraction_overflowed,
-        at_least_one_upper_block_is_non_zero, 1, radix_params.message_modulus,
-        radix_params.carry_modulus);
+    host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum,
+                         subtraction_overflowed,
+                         at_least_one_upper_block_is_non_zero, 1);

    auto message_modulus = radix_params.message_modulus;
    int factor = (i) ? message_modulus - 1 : message_modulus - 2;
@@ -436,9 +434,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      as_radix_ciphertext_slice<Torus>(&quotient_block, quotient, block_of_bit,
                                       block_of_bit + 1);
      host_addition<Torus>(streams[0], gpu_indexes[0], &quotient_block,
-                           &quotient_block, mem_ptr->did_not_overflow, 1,
-                           radix_params.message_modulus,
-                           radix_params.carry_modulus);
+                           &quotient_block, mem_ptr->did_not_overflow, 1);
    };

    for (uint j = 0; j < gpu_count; j++) {
@@ -481,9 +477,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
  // Clean the quotient and remainder
  // as even though they have no carries, they are not at nominal noise level
  host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1,
-                       remainder2, remainder1->num_radix_blocks,
-                       radix_params.message_modulus,
-                       radix_params.carry_modulus);
+                       remainder2, remainder1->num_radix_blocks);

  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -398,7 +398,7 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
  *mem_ptr = new int_noise_squashing_lut<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
      polynomial_size, num_radix_blocks, original_num_blocks,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -409,8 +409,7 @@ __host__ void host_pack_bivariate_blocks(
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    Torus const *lwe_indexes_out, CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, Torus const *lwe_indexes_in,
-    uint32_t shift, uint32_t num_radix_blocks, uint32_t const message_modulus,
-    uint32_t const carry_modulus) {
+    uint32_t shift, uint32_t num_radix_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
      lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -434,15 +433,6 @@ __host__ void host_pack_bivariate_blocks(
          (Torus *)lwe_array_1->ptr, (Torus *)lwe_array_2->ptr, lwe_indexes_in,
          lwe_dimension, shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());
-
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    lwe_array_out->degrees[i] =
-        lwe_array_1->degrees[i] * shift + lwe_array_2->degrees[i];
-    lwe_array_out->noise_levels[i] =
-        lwe_array_1->noise_levels[i] * shift + lwe_array_2->noise_levels[i];
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], message_modulus,
-                      carry_modulus);
-  }
 }

 // polynomial_size threads
@@ -531,7 +521,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  if (num_radix_blocks > lut->num_blocks)
    PANIC("Cuda error: num radix blocks on which lut is applied should be "
          "smaller or equal to the number of lut radix blocks")
-  if (num_radix_blocks > lwe_array_out->num_radix_blocks)
+  if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
+      num_radix_blocks > lwe_array_in->num_radix_blocks)
    PANIC("Cuda error: num radix blocks on which lut is applied should be "
          "smaller or equal to the number of input & output radix blocks")

@@ -607,8 +598,6 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
    auto degrees_index = lut->h_lut_indexes[i];
    lwe_array_out->degrees[i] = lut->degrees[degrees_index];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
  }
  POP_RANGE()
 }
@@ -714,8 +703,6 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
    auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
    lwe_array_out->degrees[i] = lut->degrees[degrees_index];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
  }
  POP_RANGE()
 }
@@ -764,7 +751,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
  host_pack_bivariate_blocks<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
      lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
-      shift, num_radix_blocks, params.message_modulus, params.carry_modulus);
+      shift, num_radix_blocks);
  check_cuda_error(cudaGetLastError());

  /// For multi GPU execution we create vectors of pointers for inputs and
@@ -832,8 +819,6 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
    auto degrees_index = lut->h_lut_indexes[i];
    lwe_array_out->degrees[i] = lut->degrees[degrees_index];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
  }
  POP_RANGE()
 }
@@ -1462,8 +1447,6 @@ void host_full_propagate_inplace(
    auto degrees_index = mem_ptr->lut->h_lut_indexes[0];
    input_blocks->degrees[i] = mem_ptr->lut->degrees[degrees_index];
    input_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(input_blocks->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);

    if (i < num_blocks - 1) {
      CudaRadixCiphertextFFI next_input_block;
@@ -1474,8 +1457,7 @@ void host_full_propagate_inplace(
                                       mem_ptr->tmp_big_lwe_vector, 1, 2);

      host_addition<Torus>(streams[0], gpu_indexes[0], &next_input_block,
-                           &next_input_block, &second_input, 1,
-                           params.message_modulus, params.carry_modulus);
+                           &next_input_block, &second_input, 1);
    }
  }
 }
@@ -1491,7 +1473,7 @@ uint64_t scratch_cuda_full_propagation(cudaStream_t const *streams,
  uint64_t size_tracker = 0;
  *mem_ptr =
      new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
-                                     allocate_gpu_memory, size_tracker);
+                                     allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -1726,7 +1708,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, allocate_gpu_memory,
-                                      size_tracker);
+                                      &size_tracker);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
  cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1762,7 +1744,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, num_many_lut,
-                                      allocate_gpu_memory, size_tracker);
+                                      allocate_gpu_memory, &size_tracker);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
  cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1798,7 +1780,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
                                      1, num_radix_blocks, allocate_gpu_memory,
-                                      size_tracker);
+                                      &size_tracker);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
  cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1836,7 +1818,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_sc_prop_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
-      uses_carry, allocate_gpu_memory, size_tracker);
+      uses_carry, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }
 // This function perform the three steps of Thomas' new carry propagation
@@ -1853,6 +1835,9 @@ void host_propagate_single_carry(
  PUSH_RANGE("propagate sc")
  auto num_radix_blocks = lwe_array->num_radix_blocks;
  auto params = mem->params;
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
+  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto lut_stride = mem->lut_stride;
  auto num_many_lut = mem->num_many_lut;
  CudaRadixCiphertextFFI output_flag;
@@ -1866,10 +1851,8 @@ void host_propagate_single_carry(
          "pointer")
  if (uses_carry == 1) {
    host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
-                         input_carries, 1, params.message_modulus,
-                         params.carry_modulus);
+                         input_carries, 1);
  }
-
  // Step 1
  host_compute_shifted_blocks_and_states<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array, mem->shifted_blocks_state_mem,
@@ -1893,8 +1876,7 @@ void host_propagate_single_carry(
  auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
  host_addition<Torus>(
      streams[0], gpu_indexes[0], prepared_blocks, shifted_blocks,
-      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks,
-      params.message_modulus, params.carry_modulus);
+      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks);

  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
@@ -1903,8 +1885,7 @@ void host_propagate_single_carry(
        &shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
        num_radix_blocks - 1, num_radix_blocks);
    host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
-                         &shifted_simulators, 1, params.message_modulus,
-                         params.carry_modulus);
+                         &shifted_simulators, 1);
  }

  host_radix_sum_in_groups<Torus>(
@@ -1918,8 +1899,7 @@ void host_propagate_single_carry(
        mem->prop_simu_group_carries_mem->resolved_carries, mem->num_groups - 1,
        mem->num_groups);
    host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
-                         &shifted_resolved_carries, 1, params.message_modulus,
-                         params.carry_modulus);
+                         &shifted_resolved_carries, 1);

    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], prepared_blocks, num_radix_blocks,
@@ -1958,26 +1938,15 @@ void host_add_and_propagate_single_carry(
  PUSH_RANGE("add & propagate sc")
  if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
-
-  // Check input carries if used
-  if (uses_carry == 1) {
-    if (input_carries == nullptr)
-      PANIC("Cuda error: if uses_carry is enabled, input_carries cannot be a "
-            "null pointer");
-    if (lhs_array->lwe_dimension != input_carries->lwe_dimension)
-      PANIC(
-          "Cuda error: input and input_carries lwe dimension must be the same");
-  }
-
-  // Allow nullptr for carry_out if FLAG_NONE is requested
+  if (lhs_array->lwe_dimension != rhs_array->lwe_dimension ||
+      lhs_array->lwe_dimension != input_carries->lwe_dimension ||
+      lhs_array->lwe_dimension != carry_out->lwe_dimension)
+    PANIC("Cuda error: input and output lwe dimension must be the same")
  if ((requested_flag == outputFlag::FLAG_OVERFLOW ||
-       requested_flag == outputFlag::FLAG_CARRY)) {
-    if (carry_out == nullptr)
-      PANIC("Cuda error: when requesting FLAG_CARRY or FLAG_OVERFLOW, "
-            "carry_out must be a valid pointer")
-    if (lhs_array->lwe_dimension != carry_out->lwe_dimension)
-      PANIC("Cuda error: input and carry_out lwe dimension must be the same")
-  }
+       requested_flag == outputFlag::FLAG_CARRY) &&
+      carry_out == nullptr)
+    PANIC("Cuda error: when requesting FLAG_CARRY, carry_out must be a valid "
+          "pointer")

  auto num_radix_blocks = lhs_array->num_radix_blocks;
  auto params = mem->params;
@@ -2000,13 +1969,11 @@ void host_add_and_propagate_single_carry(
  }

  host_addition<Torus>(streams[0], gpu_indexes[0], lhs_array, lhs_array,
-                       rhs_array, num_radix_blocks, params.message_modulus,
-                       params.carry_modulus);
+                       rhs_array, num_radix_blocks);

  if (uses_carry == 1) {
    host_addition<Torus>(streams[0], gpu_indexes[0], lhs_array, lhs_array,
-                         input_carries, 1, params.message_modulus,
-                         params.carry_modulus);
+                         input_carries, 1);
  }
  // Step 1
  host_compute_shifted_blocks_and_states<Torus>(
@@ -2037,8 +2004,7 @@ void host_add_and_propagate_single_carry(
  auto shifted_blocks = mem->shifted_blocks_state_mem->shifted_blocks;
  host_addition<Torus>(
      streams[0], gpu_indexes[0], prepared_blocks, shifted_blocks,
-      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks,
-      params.message_modulus, params.carry_modulus);
+      mem->prop_simu_group_carries_mem->simulators, num_radix_blocks);

  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
@@ -2047,8 +2013,7 @@ void host_add_and_propagate_single_carry(
        &shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
        num_radix_blocks - 1, num_radix_blocks);
    host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag, &output_flag,
-                         &shifted_simulators, 1, params.message_modulus,
-                         params.carry_modulus);
+                         &shifted_simulators, 1);
  }

  // Step 3
@@ -2063,8 +2028,7 @@ void host_add_and_propagate_single_carry(
    if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW &&
        uses_carry == 1) {
      host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag,
-                           &output_flag, input_carries, 1,
-                           params.message_modulus, params.carry_modulus);
+                           &output_flag, input_carries, 1);

    } else {
      CudaRadixCiphertextFFI shifted_resolved_carries;
@@ -2073,8 +2037,7 @@ void host_add_and_propagate_single_carry(
          mem->prop_simu_group_carries_mem->resolved_carries,
          mem->num_groups - 1, mem->num_groups);
      host_addition<Torus>(streams[0], gpu_indexes[0], &output_flag,
-                           &output_flag, &shifted_resolved_carries, 1,
-                           params.message_modulus, params.carry_modulus);
+                           &output_flag, &shifted_resolved_carries, 1);
    }
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], prepared_blocks, num_radix_blocks,
@@ -2087,7 +2050,6 @@ void host_add_and_propagate_single_carry(
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], lhs_array, 0, num_radix_blocks,
        mem->output_flag, 0, num_radix_blocks);
-
    copy_radix_ciphertext_slice_async<Torus>(
        streams[0], gpu_indexes[0], carry_out, 0, 1, mem->output_flag,
        num_radix_blocks, num_radix_blocks + 1);
@@ -2109,7 +2071,7 @@ uint64_t scratch_cuda_integer_overflowing_sub(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_borrow_prop_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      compute_overflow, allocate_gpu_memory, size_tracker);
+      compute_overflow, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -2179,8 +2141,7 @@ void host_single_borrow_propagate(
        &shifted_simulators, mem->prop_simu_group_carries_mem->simulators,
        num_radix_blocks - 1, num_radix_blocks);
    host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
-                         mem->overflow_block, &shifted_simulators, 1,
-                         params.message_modulus, params.carry_modulus);
+                         mem->overflow_block, &shifted_simulators, 1);
  }
  CudaRadixCiphertextFFI resolved_borrows;
  as_radix_ciphertext_slice<Torus>(
@@ -2192,8 +2153,7 @@ void host_single_borrow_propagate(
  //  borrows
  if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
    host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
-                         mem->overflow_block, &resolved_borrows, 1,
-                         params.message_modulus, params.carry_modulus);
+                         mem->overflow_block, &resolved_borrows, 1);
  }

  cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]);
@@ -2343,8 +2303,6 @@ __host__ void integer_radix_apply_noise_squashing_kb(
  for (uint i = 0; i < lut->num_blocks; i++) {
    lwe_array_out->degrees[i] = lut->degrees[0];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], params.message_modulus,
-                      params.carry_modulus);
  }
  POP_RANGE()
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -210,8 +210,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
-    bool allocate_ms_array) {
+    bool allocate_gpu_memory, bool allocate_ms_array) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -221,8 +220,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
  return scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
-      max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
-      allocate_gpu_memory);
+      max_num_radix_in_vec, params, allocate_gpu_memory);
 }

 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
@@ -236,11 +234,69 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
  if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
    PANIC("Cuda error: input vector length should be a multiple of the "
          "output's number of radix blocks")
-  host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
-      radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
-      radix_lwe_out->num_radix_blocks,
-      radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
+  // FIXME: this should not be necessary, we should make sure sum_ctxt works in
+  // the general case
+  for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
+    radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
+  }
+  switch (mem->params.polynomial_size) {
+  case 512:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 1024:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<1024>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 2048:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<2048>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 4096:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<4096>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 8192:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<8192>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  case 16384:
+    host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
+                                                AmortizedDegree<16384>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
+        radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+        radix_lwe_out->num_radix_blocks,
+        radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks,
+        nullptr);
+    break;
+  default:
+    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
+          "Supported N's are powers of two in the interval [256..16384].")
+  }
 }

 void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -24,6 +24,24 @@
 #include <string>
 #include <vector>

+template <typename Torus>
+__global__ void smart_copy(Torus *dst, Torus *src, int32_t *id_out,
+                           int32_t *id_in, size_t lwe_size) {
+  size_t tid = threadIdx.x;
+  size_t b_id = blockIdx.x;
+  size_t stride = blockDim.x;
+
+  auto input_id = id_in[b_id];
+  auto output_id = id_out[b_id];
+
+  auto cur_src = (input_id >= 0) ? &src[input_id * lwe_size] : nullptr;
+  auto cur_dst = &dst[output_id * lwe_size];
+
+  for (int i = tid; i < lwe_size; i += stride) {
+    cur_dst[i] = (input_id >= 0) ? cur_src[i] : 0;
+  }
+}
+
 template <typename Torus, class params>
 __global__ void
 all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
@@ -76,155 +94,33 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
  }
 }

-__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
-                                            uint32_t *const columns_counter,
-                                            const uint64_t *const degrees,
-                                            const uint32_t num_radix_blocks,
-                                            const uint32_t num_radix_in_vec) {
+template <typename Torus>
+__global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
+                                uint32_t chunk_size, uint32_t block_size,
+                                uint32_t num_blocks) {

-  const uint32_t idx = threadIdx.x;
-  size_t cnt = 0;
-  for (int i = 0; i < num_radix_in_vec; i++) {
-    size_t ct_id = i * num_radix_blocks + idx;
-    if (degrees[ct_id] != 0) {
-      columns[idx][cnt] = ct_id;
-      ++cnt;
+  size_t stride = blockDim.x;
+  size_t chunk_id = blockIdx.x;
+  size_t chunk_elem_size = chunk_size * num_blocks * block_size;
+  size_t radix_elem_size = num_blocks * block_size;
+  auto src_chunk = &input_blocks[chunk_id * chunk_elem_size];
+  auto dst_radix = &result_blocks[chunk_id * radix_elem_size];
+  size_t block_stride = blockIdx.y * block_size;
+  auto result = &dst_radix[block_stride];
+
+  // init shared mem with first radix of chunk
+  size_t tid = threadIdx.x;
+  for (int i = tid; i < block_size; i += stride) {
+    result[i] = src_chunk[block_stride + i];
+  }
+
+  // accumulate rest  of the radixes
+  for (int r_id = 1; r_id < chunk_size; r_id++) {
+    auto cur_src_radix = &src_chunk[r_id * radix_elem_size];
+    for (int i = tid; i < block_size; i += stride) {
+      result[i] += cur_src_radix[block_stride + i];
    }
  }
-  columns_counter[idx] = cnt;
-}
-
-template <typename Torus>
-__global__ inline void prepare_new_columns_and_pbs_indexes(
-    uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
-    Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
-    Torus *const lut_indexes, const uint32_t *const *const columns,
-    const uint32_t *const columns_counter, const uint32_t chunk_size) {
-  __shared__ uint32_t counter;
-
-  if (threadIdx.x == 0) {
-    counter = 0;
-  }
-  __syncthreads();
-
-  const uint32_t base_id = threadIdx.x;
-  const uint32_t column_len = columns_counter[base_id];
-
-  uint32_t ct_count = 0;
-  for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
-    // those indexes are for message ciphertexts
-    // for message ciphertexts in and out index should be same
-    const uint32_t in_index = columns[base_id][i];
-    new_columns[base_id][ct_count] = in_index;
-    const uint32_t pbs_index = atomicAdd(&counter, 1);
-    pbs_indexes_in[pbs_index] = in_index;
-    pbs_indexes_out[pbs_index] = in_index;
-    lut_indexes[pbs_index] = 0;
-    ++ct_count;
-  }
-  __syncthreads();
-
-  if (base_id > 0) {
-    const uint32_t prev_base_id = base_id - 1;
-    const uint32_t prev_column_len = columns_counter[prev_base_id];
-
-    for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
-      // those indexes are for carry ciphertexts
-      // for carry ciphertexts input is same as for message
-      // output will be placed to next block in the column
-      const uint32_t in_index = columns[prev_base_id][i];
-      const uint32_t out_index = columns[prev_base_id][i + 1];
-      new_columns[base_id][ct_count] = out_index;
-      const uint32_t pbs_index = atomicAdd(&counter, 1);
-      pbs_indexes_in[pbs_index] = in_index;
-      pbs_indexes_out[pbs_index] = out_index;
-      lut_indexes[pbs_index] = 1;
-      ++ct_count;
-    }
-  }
-
-  const uint32_t start_index = column_len - column_len % chunk_size;
-  for (uint32_t i = start_index; i < column_len; ++i) {
-    new_columns[base_id][ct_count] = columns[base_id][i];
-    ++ct_count;
-  }
-
-  new_columns_counter[base_id] = ct_count;
-}
-
-template <typename Torus>
-__global__ inline void prepare_final_pbs_indexes(
-    Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
-    Torus *const lut_indexes, const uint32_t num_radix_blocks) {
-  int idx = threadIdx.x;
-  pbs_indexes_in[idx] = idx % num_radix_blocks;
-  pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
-  lut_indexes[idx] = idx / num_radix_blocks;
-}
-
-template <typename Torus>
-__global__ void calculate_chunks(Torus *const input_blocks,
-                                 const uint32_t *const *const columns,
-                                 const uint32_t *const columns_counter,
-                                 const uint32_t chunk_size,
-                                 const uint32_t block_size) {
-
-  const uint32_t part_size = blockDim.x;
-  const uint32_t base_id = blockIdx.x;
-  const uint32_t part_id = blockIdx.y;
-  const uint32_t coef_id = part_id * part_size + threadIdx.x;
-
-  if (coef_id >= block_size)
-    return;
-
-  const uint32_t column_len = columns_counter[base_id];
-
-  if (column_len >= chunk_size) {
-    const uint32_t num_chunks = column_len / chunk_size;
-    Torus result = 0;
-
-    for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
-      const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
-      result = input_blocks[first_ct_id * block_size + coef_id];
-
-      for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
-        const uint32_t cur_ct_id =
-            columns[base_id][chunk_id * chunk_size + ct_id];
-        result += input_blocks[cur_ct_id * block_size + coef_id];
-      }
-
-      input_blocks[first_ct_id * block_size + coef_id] = result;
-    }
-  }
-}
-
-template <typename Torus>
-__global__ void calculate_final_chunk_into_radix(
-    Torus *const out_radix, const Torus *const input_blocks,
-    const uint32_t *const *const columns, const uint32_t *const columns_counter,
-    const uint32_t chunk_size, const uint32_t block_size) {
-
-  const uint32_t part_size = blockDim.x;
-  const uint32_t base_id = blockIdx.x;
-  const uint32_t part_id = blockIdx.y;
-  const uint32_t coef_id = part_id * part_size + threadIdx.x;
-
-  if (coef_id >= block_size)
-    return;
-
-  const uint32_t column_len = columns_counter[base_id];
-
-  Torus result = 0;
-  if (column_len) {
-    const uint32_t first_ct_id = columns[base_id][0];
-    result = input_blocks[first_ct_id * block_size + coef_id];
-
-    for (uint32_t i = 1; i < column_len; ++i) {
-      const uint32_t cur_ct_it = columns[base_id][i];
-      result += input_blocks[cur_ct_it * block_size + coef_id];
-    }
-  }
-  out_radix[base_id * block_size + coef_id] = result;
 }

 template <typename Torus, class params>
@@ -271,33 +167,29 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
        (process_msb) ? cur_msb_ct[params::degree] : 0;
  }
 }
-
 template <typename Torus>
 __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
-    bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
-    bool allocate_gpu_memory) {
+    int_radix_params params, bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
-      max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation,
-      allocate_gpu_memory, size_tracker);
+      max_num_radix_in_vec, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

-template <typename Torus>
+template <typename Torus, class params>
 __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
-    uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
-  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
-  auto big_lwe_size = big_lwe_dimension + 1;
+    uint32_t num_radix_blocks, uint32_t num_radix_in_vec,
+    int_radix_lut<Torus> *reused_lut) {

  if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
    PANIC("Cuda error: output and input radix ciphertexts should have the same "
@@ -307,235 +199,273 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    PANIC("Cuda error: input vector does not have enough blocks")
  if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
    PANIC("Cuda error: output does not have enough blocks")
-  if (num_radix_in_vec == 0)
-    return;
-
-  auto current_blocks = mem_ptr->current_blocks;
+  auto new_blocks = mem_ptr->new_blocks;
+  auto new_blocks_copy = mem_ptr->new_blocks_copy;
+  auto old_blocks = mem_ptr->old_blocks;
  auto small_lwe_vector = mem_ptr->small_lwe_vector;
-  auto d_degrees = mem_ptr->d_degrees;
-  auto d_columns = mem_ptr->d_columns;
-  auto d_columns_counter = mem_ptr->d_columns_counter;
-  auto d_new_columns = mem_ptr->d_new_columns;
-  auto d_new_columns_counter = mem_ptr->d_new_columns_counter;

+  auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
+  auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
+
+  auto message_modulus = mem_ptr->params.message_modulus;
+  auto carry_modulus = mem_ptr->params.carry_modulus;
+  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
+  auto big_lwe_size = big_lwe_dimension + 1;
  auto glwe_dimension = mem_ptr->params.glwe_dimension;
  auto polynomial_size = mem_ptr->params.polynomial_size;
  auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
-  auto chunk_size =
-      (mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
-      (mem_ptr->params.message_modulus - 1);
-
-  size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
+  auto small_lwe_size = small_lwe_dimension + 1;

  // In the case of extracting a single LWE this parameters are dummy
  uint32_t num_many_lut = 1;
  uint32_t lut_stride = 0;

-  if (terms->num_radix_blocks == 0) {
+  if (terms->num_radix_blocks == 0)
    return;
-  }
  if (num_radix_in_vec == 1) {
    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
                                             radix_lwe_out, 0, num_radix_blocks,
                                             terms, 0, num_radix_blocks);
    return;
  }
-
+  if (old_blocks != terms) {
+    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
+                                       terms);
+  }
  if (num_radix_in_vec == 2) {
-    CudaRadixCiphertextFFI terms_slice;
-    as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
-                                     2 * num_radix_blocks);
-    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
-                         &terms_slice, num_radix_blocks,
-                         mem_ptr->params.message_modulus,
-                         mem_ptr->params.carry_modulus);
+    CudaRadixCiphertextFFI old_blocks_slice;
+    as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
+                                     num_radix_blocks, 2 * num_radix_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                         &old_blocks_slice, num_radix_blocks);
    return;
  }

-  if (current_blocks != terms) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                       current_blocks, terms);
+  size_t r = num_radix_in_vec;
+  size_t total_modulus = message_modulus * carry_modulus;
+  size_t message_max = message_modulus - 1;
+  size_t chunk_size = (total_modulus - 1) / message_max;
+
+  size_t h_lwe_idx_in[terms->num_radix_blocks];
+  size_t h_lwe_idx_out[terms->num_radix_blocks];
+  int32_t h_smart_copy_in[terms->num_radix_blocks];
+  int32_t h_smart_copy_out[terms->num_radix_blocks];
+
+  /// Here it is important to query the default max shared memory on device 0
+  /// instead of cuda_get_max_shared_memory,
+  /// to avoid bugs with tree_add_chunks trying to use too much shared memory
+  auto max_shared_memory = 0;
+  check_cuda_error(cudaDeviceGetAttribute(
+      &max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
+
+  // create lut object for message and carry
+  // we allocate luts_message_carry in the host function (instead of scratch)
+  // to reduce average memory consumption
+  int_radix_lut<Torus> *luts_message_carry;
+  size_t ch_amount = r / chunk_size;
+  if (!ch_amount)
+    ch_amount++;
+  if (reused_lut == nullptr) {
+    luts_message_carry = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
+        2 * ch_amount * num_radix_blocks, true, nullptr);
+  } else {
+    luts_message_carry = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
+        2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
  }
+  auto message_acc = luts_message_carry->get_lut(0, 0);
+  auto carry_acc = luts_message_carry->get_lut(0, 1);

-  cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
-                           total_blocks_in_vec * sizeof(uint64_t), streams[0],
-                           gpu_indexes[0]);
+  // define functions for each accumulator
+  auto lut_f_message = [message_modulus](Torus x) -> Torus {
+    return x % message_modulus;
+  };
+  auto lut_f_carry = [message_modulus](Torus x) -> Torus {
+    return x / message_modulus;
+  };

-  cuda_set_device(gpu_indexes[0]);
-  radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
-      d_columns, d_columns_counter, d_degrees, num_radix_blocks,
-      num_radix_in_vec);
+  // generate accumulators
+  generate_device_accumulator<Torus>(
+      streams[0], gpu_indexes[0], message_acc,
+      luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
+      glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+      lut_f_message, true);
+  generate_device_accumulator<Torus>(
+      streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
+      luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
+      message_modulus, carry_modulus, lut_f_carry, true);
+  luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

-  bool needs_processing = false;
-  radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
-                                num_radix_in_vec, chunk_size, needs_processing);
-  int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
-  int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
-  const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
+  while (r > 2) {
+    size_t cur_total_blocks = r * num_radix_blocks;
+    size_t ch_amount = r / chunk_size;
+    if (!ch_amount)
+      ch_amount++;
+    dim3 add_grid(ch_amount, num_radix_blocks, 1);

-  mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count,
-                               num_radix_in_vec, current_blocks->degrees);
+    cuda_set_device(gpu_indexes[0]);
+    tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
+        (Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
+        std::min(r, chunk_size), big_lwe_size, num_radix_blocks);

-  while (needs_processing) {
-    auto luts_message_carry = mem_ptr->luts_message_carry;
-    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
-    auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
-    calculate_chunks<Torus>
-        <<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
-            (Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
-            chunk_size, big_lwe_size);
+    check_cuda_error(cudaGetLastError());

-    prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
-        d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
-        d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
-        d_columns_counter, chunk_size);
+    size_t total_count = 0;
+    size_t message_count = 0;
+    size_t carry_count = 0;
+    size_t sm_copy_count = 0;

-    uint32_t total_ciphertexts;
-    uint32_t total_messages;
-    current_columns.next_accumulation(total_ciphertexts, total_messages,
-                                      needs_processing);
+    generate_ids_update_degrees(
+        terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
+        h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
+        message_max, total_count, message_count, carry_count, sm_copy_count);
+    auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
+    auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
+    luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
+                                        h_lwe_idx_in, h_lwe_idx_out);

-    auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
+    size_t copy_size = sm_copy_count * sizeof(int32_t);
+    cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
+                             streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
+                             streams[0], gpu_indexes[0]);
+
+    // inside d_smart_copy_in there are only -1 values
+    // it's fine to call smart_copy with same pointer
+    // as source and destination
+    copy_radix_ciphertext_slice_async<Torus>(
+        streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
+        new_blocks, 0, r * num_radix_blocks);
+    smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
+        (Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
+        d_smart_copy_out, d_smart_copy_in, big_lwe_size);
+    check_cuda_error(cudaGetLastError());
+
+    if (carry_count > 0)
+      cuda_set_value_async<Torus>(
+          streams[0], gpu_indexes[0],
+          luts_message_carry->get_lut_indexes(0, message_count), 1,
+          carry_count);
+
+    luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
+
+    /// For multi GPU execution we create vectors of pointers for inputs and
+    /// outputs
+    std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
+    std::vector<Torus *> small_lwe_vector_vec =
+        luts_message_carry->lwe_after_ks_vec;
+    std::vector<Torus *> lwe_after_pbs_vec =
+        luts_message_carry->lwe_after_pbs_vec;
+    std::vector<Torus *> lwe_trivial_indexes_vec =
+        luts_message_carry->lwe_trivial_indexes_vec;
+
+    auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
    if (active_gpu_count == 1) {
+      /// Apply KS to go from a big LWE dimension to a small LWE dimension
+      /// After this keyswitch execution, we need to synchronize the streams
+      /// because the keyswitch and PBS do not operate on the same number of
+      /// inputs
      execute_keyswitch_async<Torus>(
          streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
-          d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
+          lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
+          polynomial_size * glwe_dimension, small_lwe_dimension,
+          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
+
+      /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+      /// dimension to a big LWE dimension
+      execute_pbs_async<Torus>(
+          streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
+          luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
+          (Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
+          ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
+          small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
+          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
+          total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
+    } else {
+      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+      multi_gpu_scatter_lwe_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, new_blocks_vec,
+          (Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
+          luts_message_carry->using_trivial_lwe_indexes, message_count,
+          big_lwe_size);
+
+      /// Apply KS to go from a big LWE dimension to a small LWE dimension
+      /// After this keyswitch execution, we need to synchronize the streams
+      /// because the keyswitch and PBS do not operate on the same number of
+      /// inputs
+      execute_keyswitch_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
+          lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
          ksks, big_lwe_dimension, small_lwe_dimension,
-          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
-          total_messages);
+          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);

+      /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
+      /// different configuration
+      multi_gpu_gather_lwe_async<Torus>(
+          streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
+          small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
+          luts_message_carry->using_trivial_lwe_indexes, message_count,
+          small_lwe_size);
+      /// Synchronize all GPUs
+      for (uint i = 0; i < active_gpu_count; i++) {
+        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+      }
+
+      multi_gpu_scatter_lwe_async<Torus>(
+          streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
+          (Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
+          luts_message_carry->using_trivial_lwe_indexes, total_count,
+          small_lwe_size);
+
+      /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+      /// dimension to a big LWE dimension
      execute_pbs_async<Torus>(
-          streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
-          d_pbs_indexes_out, luts_message_carry->lut_vec,
-          luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
-          d_pbs_indexes_in, bsks, ms_noise_reduction_key,
+          streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+          lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
+          luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
+          lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
          luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
          polynomial_size, mem_ptr->params.pbs_base_log,
          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
-          total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
-          lut_stride);
-    } else {
-      Torus *h_lwe_indexes_in_pinned;
-      Torus *h_lwe_indexes_out_pinned;
-      cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
-                     total_ciphertexts * sizeof(Torus));
-      cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
-                     total_ciphertexts * sizeof(Torus));
-      for (uint32_t i = 0; i < total_ciphertexts; i++) {
-        h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
-        h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
-      }
-      cuda_memcpy_async_to_cpu(
-          h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
-          total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
-      cuda_memcpy_async_to_cpu(
-          h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
-          total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
-      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-      for (uint32_t i = 0; i < total_ciphertexts; i++) {
-        luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
-        luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
-      }
-      cudaFreeHost(h_lwe_indexes_in_pinned);
-      cudaFreeHost(h_lwe_indexes_out_pinned);
+          total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);

-      luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
-      luts_message_carry->using_trivial_lwe_indexes = false;
-
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, gpu_count, current_blocks, current_blocks, bsks,
-          ksks, ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
+      multi_gpu_gather_lwe_async<Torus>(
+          streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
+          lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
+          luts_message_carry->using_trivial_lwe_indexes, total_count,
+          big_lwe_size);
+      /// Synchronize all GPUs
+      for (uint i = 0; i < active_gpu_count; i++) {
+        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+      }
    }
-    cuda_set_device(gpu_indexes[0]);
-    std::swap(d_columns, d_new_columns);
-    std::swap(d_columns_counter, d_new_columns_counter);
-  }
-
-  calculate_final_chunk_into_radix<Torus>
-      <<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
-          (Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
-          d_columns, d_columns_counter, chunk_size, big_lwe_size);
-
-  if (mem_ptr->reduce_degrees_for_single_carry_propagation) {
-    auto luts_message_carry = mem_ptr->luts_message_carry;
-    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
-    auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
-    prepare_final_pbs_indexes<Torus>
-        <<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
-            d_pbs_indexes_in, d_pbs_indexes_out,
-            luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
-
-    set_zero_radix_ciphertext_slice_async<Torus>(
-        streams[0], gpu_indexes[0], current_blocks, num_radix_blocks,
-        num_radix_blocks + 1);
-
-    auto active_gpu_count =
-        get_active_gpu_count(2 * num_radix_blocks, gpu_count);
-
-    if (active_gpu_count == 1) {
-      execute_keyswitch_async<Torus>(
-          streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
-          d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
-          big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
-          mem_ptr->params.ks_level, num_radix_blocks);
-
-      execute_pbs_async<Torus>(
-          streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
-          d_pbs_indexes_out, luts_message_carry->lut_vec,
-          luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
-          d_pbs_indexes_in, bsks, ms_noise_reduction_key,
-          luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
-          polynomial_size, mem_ptr->params.pbs_base_log,
-          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
-          2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
-          lut_stride);
-    } else {
-      uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
-      Torus *h_lwe_indexes_in_pinned;
-      Torus *h_lwe_indexes_out_pinned;
-      cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
-                     num_blocks_in_apply_lut * sizeof(Torus));
-      cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
-                     num_blocks_in_apply_lut * sizeof(Torus));
-      for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
-        h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
-        h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
-      }
-      cuda_memcpy_async_to_cpu(
-          h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
-          num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
-      cuda_memcpy_async_to_cpu(
-          h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
-          num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
-      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-      for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
-        luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
-        luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
-      }
-      cudaFreeHost(h_lwe_indexes_in_pinned);
-      cudaFreeHost(h_lwe_indexes_out_pinned);
-
-      luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
-      luts_message_carry->using_trivial_lwe_indexes = false;
-
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
-          bsks, ksks, ms_noise_reduction_key, luts_message_carry,
-          num_blocks_in_apply_lut);
+    for (uint i = 0; i < total_count; i++) {
+      auto degrees_index = luts_message_carry->h_lut_indexes[i];
+      new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
+      new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
    }
-    calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
-                            num_radix_blocks, num_radix_in_vec, chunk_size,
-                            mem_ptr->params.message_modulus);
-    cuda_set_device(gpu_indexes[0]);
-    CudaRadixCiphertextFFI current_blocks_slice;
-    as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
-                                     num_radix_blocks, 2 * num_radix_blocks);

-    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
-                         current_blocks, &current_blocks_slice,
-                         num_radix_blocks, mem_ptr->params.message_modulus,
-                         mem_ptr->params.carry_modulus);
+    int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
+    int new_blocks_created = 2 * ch_amount * num_radix_blocks;
+
+    if (rem_blocks > 0)
+      copy_radix_ciphertext_slice_async<Torus>(
+          streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
+          new_blocks_created + rem_blocks, old_blocks,
+          cur_total_blocks - rem_blocks, cur_total_blocks);
+    std::swap(new_blocks, old_blocks);
+    r = (new_blocks_created + rem_blocks) / num_radix_blocks;
  }
+  luts_message_carry->release(streams, gpu_indexes, gpu_count);
+  delete (luts_message_carry);
+
+  CudaRadixCiphertextFFI old_blocks_slice;
+  as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
+                                   num_radix_blocks, 2 * num_radix_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                       &old_blocks_slice, num_radix_blocks);
 }

 template <typename Torus, class params>
@@ -669,10 +599,10 @@ __host__ void host_integer_mult_radix_kb(
    size_t b_id = i % num_blocks;
    terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
  }
-  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
+  host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
      ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
-      2 * num_blocks);
+      2 * num_blocks, mem_ptr->luts_array);

  auto scp_mem_ptr = mem_ptr->sc_prop_mem;
  uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -693,7 +623,7 @@ __host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_mul_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, is_boolean_left,
-      is_boolean_right, num_radix_blocks, allocate_gpu_memory, size_tracker);
+      is_boolean_right, num_radix_blocks, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -106,8 +106,6 @@ __host__ void host_integer_radix_negation(

    lwe_array_out->degrees[i] = z - static_cast<uint64_t>(zb);
    lwe_array_out->noise_levels[i] = lwe_array_in->noise_levels[i];
-    CHECK_NOISE_LEVEL(lwe_array_out->noise_levels[i], message_modulus,
-                      carry_modulus);
    zb = z / message_modulus;
  }
 }
@@ -123,7 +121,7 @@ __host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_overflowing_sub_memory<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
-      allocate_ms_array, size_tracker);
+      allocate_ms_array, &size_tracker);
  POP_RANGE()
  return size_tracker;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -13,7 +13,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
                                        CudaRadixCiphertextFFI *radix,
                                        const uint32_t num_radix_blocks,
                                        const uint32_t lwe_dimension,
-                                        uint64_t &size_tracker,
+                                        uint64_t *size_tracker,
                                        bool allocate_gpu_memory) {
  PUSH_RANGE("create zero radix ct");
  radix->lwe_dimension = lwe_dimension;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -34,7 +34,7 @@ void update_degrees_after_scalar_bitor(uint64_t *output_degrees,
    auto result = max;

    for (uint j = 0; j < min + 1; j++) {
-      if ((max | j) > result) {
+      if (max | j > result) {
        result = max | j;
      }
    }
@@ -52,7 +52,7 @@ void update_degrees_after_scalar_bitxor(uint64_t *output_degrees,

    // Try every possibility to find the worst case
    for (uint j = 0; j < min + 1; j++) {
-      if ((max ^ j) > result) {
+      if (max ^ j > result) {
        result = max ^ j;
      }
    }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -277,9 +277,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
        auto overflowed = x_0 < x_1;
        return (Torus)(invert_flags.second ^ overflowed);
      };
-      uint64_t size = 0;
      int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
+          streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);

      generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
@@ -579,9 +578,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
               is_x_less_than_y_given_input_borrow<Torus>(x_0, x_1, 0,
                                                          message_modulus);
      };
-      uint64_t size = 0;
      int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
+          streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);

      generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
@@ -1,202 +0,0 @@
-#include "scalar_div.cuh"
-
-uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
-      (int_unsigned_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory);
-}
-
-void cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi) {
-
-  host_integer_unsigned_scalar_div_radix<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
-      (int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
-      ms_noise_reduction_key, scalar_divisor_ffi);
-}
-
-void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
-      (int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
-
-uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    bool allocate_gpu_memory, bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
-      (int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory);
-}
-
-void cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
-
-  host_integer_signed_scalar_div_radix_kb<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
-      (int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
-      ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
-}
-
-void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_signed_scalar_div_mem<uint64_t> *mem_ptr =
-      (int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
-
-uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
-    bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
-      (int_unsigned_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
-}
-
-void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    void const *clear_blocks, void const *h_clear_blocks,
-    uint32_t num_clear_blocks) {
-
-  host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
-      remainder_ct, (int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr,
-      bsks, (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
-      divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
-      (uint64_t *)clear_blocks, (uint64_t *)h_clear_blocks, num_clear_blocks);
-}
-
-void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
-      (int_unsigned_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
-
-uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
-    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
-    bool allocate_ms_array) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          allocate_ms_array);
-
-  return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
-      (int_signed_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
-}
-
-void cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    uint32_t numerator_bits) {
-
-  host_integer_signed_scalar_div_rem_radix<uint64_t>(
-      (cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
-      remainder_ct, (int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
-      divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
-      numerator_bits);
-}
-
-void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =
-      (int_signed_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cuh
@@ -1,415 +0,0 @@
-#ifndef SCALAR_DIV_CUH
-#define SCALAR_DIV_CUH
-
-#include "integer/integer_utilities.h"
-#include "integer/scalar_bitops.cuh"
-#include "integer/scalar_mul.cuh"
-#include "integer/scalar_shifts.cuh"
-#include "integer/subtraction.cuh"
-
-template <typename Torus>
-__host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, const int_radix_params params,
-    int_unsigned_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-
-  *mem_ptr = new int_unsigned_scalar_div_mem<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
-
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_integer_unsigned_scalar_div_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
-    int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi) {
-
-  if (scalar_divisor_ffi->is_abs_divisor_one) {
-    return;
-  }
-
-  if (scalar_divisor_ffi->is_divisor_pow2) {
-    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct,
-        scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
-        bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
-    return;
-  }
-
-  if (scalar_divisor_ffi->divisor_has_more_bits_than_numerator) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                       mem_ptr->tmp_ffi);
-    return;
-  }
-
-  if (scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
-
-    if (scalar_divisor_ffi->shift_pre != (uint64_t)0) {
-      PANIC("shift_pre should be == 0");
-    }
-
-    if (scalar_divisor_ffi->shift_post == (uint32_t)0) {
-      PANIC("shift_post should be > 0");
-    }
-
-    CudaRadixCiphertextFFI *numerator_cpy = mem_ptr->tmp_ffi;
-
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                       numerator_cpy, numerator_ct);
-
-    host_integer_radix_scalar_mul_high_kb<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_cpy,
-        mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
-        scalar_divisor_ffi);
-
-    host_sub_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
-        nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
-        ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
-
-    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, (uint32_t)1,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
-        numerator_ct->num_radix_blocks);
-
-    host_add_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
-        nullptr, mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key,
-        FLAG_NONE, (uint32_t)0);
-
-    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct,
-        scalar_divisor_ffi->shift_post - (uint32_t)1,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
-        numerator_ct->num_radix_blocks);
-
-    return;
-  }
-
-  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, numerator_ct,
-      scalar_divisor_ffi->shift_pre, mem_ptr->logical_scalar_shift_mem, bsks,
-      ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
-
-  host_integer_radix_scalar_mul_high_kb<Torus>(
-      streams, gpu_indexes, gpu_count, numerator_ct,
-      mem_ptr->scalar_mul_high_mem, ksks, ms_noise_reduction_key, bsks,
-      scalar_divisor_ffi);
-
-  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, numerator_ct,
-      scalar_divisor_ffi->shift_post, mem_ptr->logical_scalar_shift_mem, bsks,
-      ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
-}
-
-template <typename Torus>
-__host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_radix_params params,
-    int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-
-  *mem_ptr = new int_signed_scalar_div_mem<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, allocate_gpu_memory, size_tracker);
-
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_integer_signed_scalar_div_radix_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
-    int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
-
-  if (scalar_divisor_ffi->is_abs_divisor_one) {
-    if (scalar_divisor_ffi->is_divisor_negative) {
-      CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
-
-      host_integer_radix_negation<Torus>(
-          streams, gpu_indexes, gpu_count, tmp, numerator_ct,
-          mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
-          numerator_ct->num_radix_blocks);
-
-      copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                         numerator_ct, tmp);
-    }
-    return;
-  }
-
-  if (scalar_divisor_ffi->chosen_multiplier_has_more_bits_than_numerator) {
-    set_zero_radix_ciphertext_slice_async<Torus>(
-        streams[0], gpu_indexes[0], numerator_ct, 0,
-        numerator_ct->num_radix_blocks);
-    return;
-  }
-
-  CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
-
-  if (scalar_divisor_ffi->is_divisor_pow2) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
-                                       numerator_ct);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp,
-        scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp,
-        numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
-        tmp->num_radix_blocks);
-
-    host_add_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
-        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
-        (uint32_t)0);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp,
-        scalar_divisor_ffi->chosen_multiplier_num_bits,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-  } else if (!scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
-                                       numerator_ct);
-
-    host_integer_radix_signed_scalar_mul_high_kb<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
-        ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
-                                       numerator_ct);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    host_sub_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
-        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
-        FLAG_NONE, (uint32_t)0);
-
-  } else {
-
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
-                                       numerator_ct);
-
-    host_integer_radix_signed_scalar_mul_high_kb<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
-        ksks, scalar_divisor_ffi, ms_noise_reduction_key, bsks);
-
-    host_add_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
-        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
-        (uint32_t)0);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, scalar_divisor_ffi->shift_post,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
-                                       numerator_ct);
-
-    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-        streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
-        ms_noise_reduction_key);
-
-    host_sub_and_propagate_single_carry<Torus>(
-        streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
-        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
-        FLAG_NONE, (uint32_t)0);
-  }
-
-  if (scalar_divisor_ffi->is_divisor_negative) {
-    host_integer_radix_negation<Torus>(
-        streams, gpu_indexes, gpu_count, numerator_ct, tmp,
-        mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
-        numerator_ct->num_radix_blocks);
-  } else {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                       tmp);
-  }
-}
-
-template <typename Torus>
-__host__ uint64_t scratch_integer_unsigned_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, const int_radix_params params,
-    int_unsigned_scalar_div_rem_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-  *mem_ptr = new int_unsigned_scalar_div_rem_buffer<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
-      size_tracker);
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_integer_unsigned_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
-    CudaRadixCiphertextFFI *remainder_ct,
-    int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    Torus const *clear_blocks, Torus const *h_clear_blocks,
-    uint32_t num_clear_blocks) {
-
-  auto numerator_ct = mem_ptr->numerator_ct;
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                     quotient_ct);
-
-  host_integer_unsigned_scalar_div_radix(
-      streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->unsigned_div_mem,
-      bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi);
-
-  if (scalar_divisor_ffi->is_divisor_pow2) {
-
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       numerator_ct);
-    host_integer_radix_scalar_bitop_kb(
-        streams, gpu_indexes, gpu_count, remainder_ct, remainder_ct,
-        clear_blocks, h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
-        bsks, ksks, ms_noise_reduction_key);
-
-  } else {
-    if (!scalar_divisor_ffi->is_divisor_zero) {
-      copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
-                                         remainder_ct, quotient_ct);
-
-      if (!scalar_divisor_ffi->is_abs_divisor_one &&
-          remainder_ct->num_radix_blocks != 0) {
-
-        host_integer_scalar_mul_radix<Torus>(
-            streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
-            divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
-            ms_noise_reduction_key, mem_ptr->params.message_modulus,
-            num_scalars_divisor);
-      }
-    }
-
-    host_sub_and_propagate_single_carry(
-        streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
-        nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
-        ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
-
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       numerator_ct);
-  }
-}
-
-template <typename Torus>
-__host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, const int_radix_params params,
-    int_signed_scalar_div_rem_buffer<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
-
-  uint64_t size_tracker = 0;
-
-  *mem_ptr = new int_signed_scalar_div_rem_buffer<Torus>(
-      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory,
-      size_tracker);
-
-  return size_tracker;
-}
-
-template <typename Torus>
-__host__ void host_integer_signed_scalar_div_rem_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
-    CudaRadixCiphertextFFI *remainder_ct,
-    int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint64_t const *divisor_has_at_least_one_set,
-    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
-    uint32_t numerator_bits) {
-
-  auto numerator_ct = mem_ptr->numerator_ct;
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
-                                     quotient_ct);
-
-  host_integer_signed_scalar_div_radix_kb(
-      streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->signed_div_mem,
-      bsks, ksks, ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
-
-  host_propagate_single_carry<Torus>(
-      streams, gpu_indexes, gpu_count, quotient_ct, nullptr, nullptr,
-      mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
-      (uint32_t)0);
-
-  if (!scalar_divisor_ffi->is_divisor_negative &&
-      scalar_divisor_ffi->is_divisor_pow2) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       quotient_ct);
-
-    host_integer_radix_logical_scalar_shift_kb_inplace(
-        streams, gpu_indexes, gpu_count, remainder_ct,
-        scalar_divisor_ffi->ilog2_divisor, mem_ptr->logical_scalar_shift_mem,
-        bsks, ksks, ms_noise_reduction_key, remainder_ct->num_radix_blocks);
-
-  } else if (!scalar_divisor_ffi->is_divisor_zero) {
-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                       quotient_ct);
-
-    bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
-                          !scalar_divisor_ffi->is_divisor_negative;
-
-    if (!is_divisor_one && remainder_ct->num_radix_blocks != 0) {
-      host_integer_scalar_mul_radix<Torus>(
-          streams, gpu_indexes, gpu_count, remainder_ct, decomposed_divisor,
-          divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
-          ms_noise_reduction_key, mem_ptr->params.message_modulus,
-          num_scalars_divisor);
-    }
-  }
-
-  host_sub_and_propagate_single_carry(
-      streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
-      nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
-      ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
-
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
-                                     numerator_ct);
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -6,8 +6,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
-    bool allocate_ms_array) {
+    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -18,7 +17,28 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
  return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
-      num_scalar_bits, allocate_gpu_memory);
+      allocate_gpu_memory);
+}
+
+uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool anticipated_buffer_drop, bool allocate_gpu_memory,
+    bool allocate_ms_array) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          glwe_dimension * polynomial_size, lwe_dimension,
+                          ks_level, ks_base_log, pbs_level, pbs_base_log,
+                          grouping_factor, message_modulus, carry_modulus,
+                          allocate_ms_array);
+
+  return scratch_cuda_integer_radix_scalar_mul_high_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_scalar_mul_high<uint64_t> **)mem_ptr, num_blocks, params,
+      anticipated_buffer_drop, allocate_gpu_memory);
 }

 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
@@ -29,11 +49,73 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {

-  host_integer_scalar_mul_radix<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-      decomposed_scalar, has_at_least_one_set,
-      reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
-      (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+  switch (polynomial_size) {
+  case 512:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 1024:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 2048:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 4096:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 8192:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  case 16384:
+    host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
+        (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+        decomposed_scalar, has_at_least_one_set,
+        reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
+        (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+        num_scalars);
+    break;
+  default:
+    PANIC("Cuda error (scalar multiplication): unsupported polynomial size. "
+          "Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
+  }
+}
+
+void cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
+    uint64_t rhs, uint64_t const *decomposed_scalar,
+    uint64_t const *has_at_least_one_set,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_scalars) {
+
+  host_integer_radix_scalar_mul_high_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, ct,
+      (int_scalar_mul_high<uint64_t> *)mem_ptr, (uint64_t **)ksks, rhs,
+      decomposed_scalar, has_at_least_one_set, ms_noise_reduction_key, bsks,
      num_scalars);
 }

@@ -47,3 +129,22 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,

  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
+
+void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
+
+  int_scalar_mul_high<uint64_t> *mem_ptr =
+      (int_scalar_mul_high<uint64_t> *)(*mem_ptr_void);
+
+  mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
+}
+
+void cuda_small_scalar_multiplication_integer_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *lwe_array, uint64_t scalar) {
+
+  host_integer_small_scalar_mul_radix<uint64_t>((cudaStream_t *)(streams),
+                                                gpu_indexes, gpu_count,
+                                                lwe_array, lwe_array, scalar);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -33,16 +33,16 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params,
-    uint32_t num_scalar_bits, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_scalar_mul_buffer<T>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-      num_scalar_bits, allocate_gpu_memory, true, size_tracker);
+      allocate_gpu_memory, true, &size_tracker);
  return size_tracker;
 }

-template <typename T>
+template <typename T, class params>
 __host__ void host_integer_scalar_mul_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
@@ -80,7 +80,7 @@ __host__ void host_integer_scalar_mul_radix(
    }
  }
  size_t j = 0;
-  for (size_t i = 0; i < std::min(num_scalars, num_ciphertext_bits); i++) {
+  for (size_t i = 0; i < min(num_scalars, num_ciphertext_bits); i++) {
    if (decomposed_scalar[i] == 1) {
      // Perform a block shift
      CudaRadixCiphertextFFI preshifted_radix_ct;
@@ -116,10 +116,13 @@ __host__ void host_integer_scalar_mul_radix(
    set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
                                             lwe_array, 0, num_radix_blocks);
  } else {
-    host_integer_partial_sum_ciphertexts_vec_kb<T>(
+    for (int i = 0; i < j * num_radix_blocks; i++) {
+      all_shifted_buffer->degrees[i] = message_modulus - 1;
+    }
+    host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
        streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
        ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
-        num_radix_blocks, j);
+        num_radix_blocks, j, nullptr);

    auto scp_mem_ptr = mem->sc_prop_mem;
    uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -136,8 +139,7 @@ template <typename T>
 __host__ void host_integer_small_scalar_mul_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *output_lwe_array,
-    CudaRadixCiphertextFFI *input_lwe_array, T scalar,
-    const uint32_t message_modulus, const uint32_t carry_modulus) {
+    CudaRadixCiphertextFFI *input_lwe_array, T scalar) {

  if (output_lwe_array->num_radix_blocks != input_lwe_array->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -167,20 +169,35 @@ __host__ void host_integer_small_scalar_mul_radix(
    output_lwe_array->noise_levels[i] =
        input_lwe_array->noise_levels[i] * scalar;
    output_lwe_array->degrees[i] = input_lwe_array->degrees[i] * scalar;
-    CHECK_NOISE_LEVEL(output_lwe_array->noise_levels[i], message_modulus,
-                      carry_modulus);
  }
 }

+template <typename Torus>
+__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_scalar_mul_high<Torus> **mem_ptr,
+    uint32_t num_radix_blocks, int_radix_params params,
+    bool anticipated_buffer_drop, bool allocate_gpu_memory) {
+
+  uint64_t size_tracker = 0;
+
+  *mem_ptr = new int_scalar_mul_high<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
+      allocate_gpu_memory, LEFT_SHIFT, anticipated_buffer_drop, &size_tracker);
+
+  return size_tracker;
+}
+
 template <typename Torus>
 __host__ void host_integer_radix_scalar_mul_high_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
-    int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
+    int_scalar_mul_high<Torus> *mem_ptr, Torus *const *ksks, uint64_t rhs,
+    uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
+    void *const *bsks, uint32_t num_scalars) {

-  if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
+  if (rhs == (uint64_t)0) {
    set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
                                                 0, ct->num_radix_blocks);
    return;
@@ -191,71 +208,66 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
  host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
                                                        gpu_indexes);

-  if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
-      !scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
-      tmp_ffi->num_radix_blocks != 0) {
+  if (rhs != (uint64_t)1 || tmp_ffi->num_radix_blocks != 0) {
+    if ((rhs & (rhs - 1)) == 0) {
+
+      uint32_t shift = std::log2(rhs);

-    if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_ffi,
-          scalar_divisor_ffi->ilog2_chosen_multiplier,
+          streams, gpu_indexes, gpu_count, tmp_ffi, shift,
          mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
          ms_noise_reduction_key, tmp_ffi->num_radix_blocks);

    } else {

-      host_integer_scalar_mul_radix<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_ffi,
-          scalar_divisor_ffi->decomposed_chosen_multiplier,
-          scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
-          mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
-          ms_noise_reduction_key, mem_ptr->params.message_modulus,
-          scalar_divisor_ffi->num_scalars);
-    }
-  }
-
-  host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_signed_scalar_mul_high_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
-    int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    void *const *bsks) {
-
-  if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
-    set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
-                                                 0, ct->num_radix_blocks);
-    return;
-  }
-
-  CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;
-
-  host_extend_radix_with_sign_msb<Torus>(
-      streams, gpu_indexes, gpu_count, tmp_ffi, ct, mem_ptr->extend_radix_mem,
-      ct->num_radix_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
-
-  if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
-      !scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
-      tmp_ffi->num_radix_blocks != 0) {
-
-    if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
-      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_ffi,
-          scalar_divisor_ffi->ilog2_chosen_multiplier,
-          mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
-          ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
-    } else {
-      host_integer_scalar_mul_radix<Torus>(
-          streams, gpu_indexes, gpu_count, tmp_ffi,
-          scalar_divisor_ffi->decomposed_chosen_multiplier,
-          scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
-          mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
-          ms_noise_reduction_key, mem_ptr->params.message_modulus,
-          scalar_divisor_ffi->num_scalars);
+      switch (mem_ptr->params.polynomial_size) {
+      case 512:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 1024:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 2048:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 4096:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 8192:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      case 16384:
+        host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
+            streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
+            has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
+            (uint64_t **)ksks, ms_noise_reduction_key,
+            mem_ptr->params.message_modulus, num_scalars);
+        break;
+      default:
+        PANIC(
+            "Cuda error (scalar multiplication): unsupported polynomial size. "
+            "Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
+      }
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -133,7 +133,7 @@ __host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_shift_and_rotate_buffer<Torus>(
      streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
-      num_radix_blocks, allocate_gpu_memory, size_tracker);
+      num_radix_blocks, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

@@ -159,13 +159,11 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    // control_bit|b|a
    host_pack_bivariate_blocks<Torus>(
        streams, gpu_indexes, gpu_count, mux_inputs, mux_lut->lwe_indexes_out,
-        rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits,
-        mem->params.message_modulus, mem->params.carry_modulus);
+        rotated_input, input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits);

    // The shift bit is already properly aligned/positioned
    host_add_the_same_block_to_all_blocks<Torus>(
-        streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit,
-        mem->params.message_modulus, mem->params.carry_modulus);
+        streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit);

    // we have
    // control_bit|b|a
@@ -185,9 +183,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

  // Bitshift and add the other bits
  for (int i = bits_per_block - 2; i >= 0; i--) {
-    host_integer_small_scalar_mul_radix<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array, lwe_array, 2,
-        mem->params.message_modulus, mem->params.carry_modulus);
+    host_integer_small_scalar_mul_radix<Torus>(streams, gpu_indexes, gpu_count,
+                                               lwe_array, lwe_array, 2);
    for (int j = 0; j < num_radix_blocks; j++) {
      CudaRadixCiphertextFFI block;
      CudaRadixCiphertextFFI bit_to_add;
@@ -196,8 +193,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
                                       i + j * bits_per_block,
                                       i + j * bits_per_block + 1);
      host_addition<Torus>(streams[0], gpu_indexes[0], &block, &block,
-                           &bit_to_add, 1, mem->params.message_modulus,
-                           mem->params.carry_modulus);
+                           &bit_to_add, 1);
    }

    // To give back a clean ciphertext
--- a/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
@@ -23,7 +23,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry(

  *mem_ptr = new int_sub_and_propagate<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);

  return size_tracker;
 }
@@ -73,7 +73,6 @@ __host__ void host_integer_radix_subtraction(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in_2,
      message_modulus, carry_modulus, num_radix_blocks);
  host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_out,
-                       lwe_array_in_1, num_radix_blocks, message_modulus,
-                       carry_modulus);
+                       lwe_array_in_1, num_radix_blocks);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -10,7 +10,7 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
      output->num_radix_blocks != input_2->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
  host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
-                          input_1, input_2, output->num_radix_blocks, 0, 0);
+                          input_1, input_2, output->num_radix_blocks);
 }

 /*
@@ -48,7 +48,7 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
      output->num_radix_blocks != input_2->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
  host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
-                          input_1, input_2, output->num_radix_blocks, 0, 0);
+                          input_1, input_2, output->num_radix_blocks);
 }

 /*
@@ -147,3 +147,44 @@ void cuda_add_lwe_ciphertext_vector_plaintext_64(
      static_cast<const uint64_t *>(lwe_array_in), plaintext_in,
      input_lwe_dimension, input_lwe_ciphertext_count);
 }
+
+/*
+ * Perform the subtraction of a u64 input LWE ciphertext vector with a u64 input
+ * plaintext vector.
+ * - `stream` is a void pointer to the Cuda stream to be used in the kernel
+ * launch
+ * - `gpu_index` is the index of the GPU to be used in the kernel launch
+ * - `lwe_array_out` is an array of size
+ * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
+ * been allocated on the GPU before calling this function, and that will hold
+ * the result of the computation.
+ * - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
+ * been allocated and initialized before calling this function. It has the same
+ * size as the output array.
+ * - `plaintext_array_in` is the plaintext vector used as input, it should have
+ * been allocated and initialized before calling this function. It should be of
+ * size `input_lwe_ciphertext_count`.
+ * - `input_lwe_dimension` is the number of mask elements in the input and
+ * output LWE ciphertext vectors
+ * - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
+ * input LWE ciphertext vector, as well as in the output. It is also the number
+ * of plaintexts in the input plaintext vector.
+ *
+ * Each plaintext of the input plaintext vector is subtracted to the body of the
+ * corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
+ * operation is stored in the output LWE ciphertext vector. The two input
+ * vectors are unchanged. This function is a wrapper to a device function that
+ * performs the operation on the GPU.
+ */
+void cuda_sub_lwe_ciphertext_vector_plaintext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
+    void const *plaintext_array_in, uint32_t input_lwe_dimension,
+    uint32_t input_lwe_ciphertext_count) {
+
+  host_subtraction_plaintext<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<uint64_t *>(lwe_array_in),
+      static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
+      input_lwe_ciphertext_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -9,7 +9,6 @@
 #include "device.h"
 #include "helper_multi_gpu.h"
 #include "integer/integer.h"
-#include "integer/integer_utilities.h"
 #include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
 #include <stdio.h>
@@ -103,12 +102,11 @@ __global__ void addition(T *output, T const *input_1, T const *input_2,
 // Coefficient-wise addition
 // num_radix_blocks selects the amount of blocks to be added from the inputs
 template <typename T>
-__host__ void
-host_addition(cudaStream_t stream, uint32_t gpu_index,
-              CudaRadixCiphertextFFI *output,
-              CudaRadixCiphertextFFI const *input_1,
-              CudaRadixCiphertextFFI const *input_2, uint32_t num_radix_blocks,
-              const uint32_t message_modulus, const uint32_t carry_modulus) {
+__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index,
+                            CudaRadixCiphertextFFI *output,
+                            CudaRadixCiphertextFFI const *input_1,
+                            CudaRadixCiphertextFFI const *input_2,
+                            uint32_t num_radix_blocks) {
  if (output->lwe_dimension != input_1->lwe_dimension ||
      output->lwe_dimension != input_2->lwe_dimension)
    PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -137,7 +135,6 @@ host_addition(cudaStream_t stream, uint32_t gpu_index,
    output->degrees[i] = input_1->degrees[i] + input_2->degrees[i];
    output->noise_levels[i] =
        input_1->noise_levels[i] + input_2->noise_levels[i];
-    CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
  }
 }

@@ -163,8 +160,7 @@ template <typename T>
 __host__ void host_add_the_same_block_to_all_blocks(
    cudaStream_t stream, uint32_t gpu_index, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *input_with_multiple_blocks,
-    CudaRadixCiphertextFFI const *input_with_single_block,
-    const uint32_t message_modulus, const uint32_t carry_modulus) {
+    CudaRadixCiphertextFFI const *input_with_single_block) {
  if (output->num_radix_blocks != input_with_multiple_blocks->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
  if (input_with_single_block->num_radix_blocks != 1)
@@ -196,7 +192,6 @@ __host__ void host_add_the_same_block_to_all_blocks(
                         input_with_single_block->degrees[0];
    output->noise_levels[i] = input_with_multiple_blocks->noise_levels[i] +
                              input_with_single_block->noise_levels[0];
-    CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
  }
 }

@@ -276,7 +271,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
 }

 template <typename T>
-__global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
+__global__ void radix_body_subtraction_inplace(T *lwe_ct,
+                                               const T *plaintext_input,
                                               uint32_t input_lwe_dimension,
                                               uint32_t num_entries) {

@@ -293,7 +289,7 @@ __global__ void radix_body_subtraction_inplace(T *lwe_ct, T *plaintext_input,
 template <typename T>
 __host__ void host_subtraction_plaintext(cudaStream_t stream,
                                         uint32_t gpu_index, T *output,
-                                         T *lwe_input, T *plaintext_input,
+                                         T *lwe_input, const T *plaintext_input,
                                         uint32_t input_lwe_dimension,
                                         uint32_t input_lwe_ciphertext_count) {

@@ -385,7 +381,6 @@ __host__ void host_unchecked_sub_with_correcting_term(
    output->noise_levels[i] =
        input_1->noise_levels[i] + input_2->noise_levels[i];
    zb = z / message_modulus;
-    CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
@@ -35,20 +35,6 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
                           static_cast<cudaStream_t>(stream), gpu_index);
 }

-void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
-    void *stream, uint32_t gpu_index, void *dest, void const *src,
-    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
-    uint32_t polynomial_size, uint32_t grouping_factor) {
-  uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
-                               level_count * (1 << grouping_factor) /
-                               grouping_factor;
-  size_t buffer_size =
-      total_polynomials * polynomial_size * sizeof(__uint128_t);
-
-  cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size,
-                           static_cast<cudaStream_t>(stream), gpu_index);
-}
-
 // We need these lines so the compiler knows how to specialize these functions
 template __device__ const uint64_t *
 get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
@@ -94,14 +80,6 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
                                                    int glwe_dimension,
                                                    uint32_t level_count);

-template __device__ const __uint128_t *
-get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i,
-                                          int k, int level,
-                                          uint32_t grouping_factor,
-                                          uint32_t polynomial_size,
-                                          uint32_t glwe_dimension,
-                                          uint32_t level_count);
-
 template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
    const uint64_t *ptr, int g, int i, int k, int level,
    uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -83,62 +83,6 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
  __syncthreads();
 }

-/** Perform the matrix multiplication between the GGSW and the GLWE,
- * each block operating on a single level for mask and body.
- * Both operands should be at fourier domain
- *
- * This function assumes:
- *  - Thread blocks at dimension z relates to the decomposition level.
- *  - Thread blocks at dimension y relates to the glwe dimension.
- *  - polynomial_size / params::opt threads are available per block
- */
-template <typename G, class params>
-__device__ void mul_ggsw_glwe_in_fourier_domain_128(
-    double *fft, double *join_buffer,
-    const double *__restrict__ bootstrapping_key, int iteration, G &group,
-    bool support_dsm = false) {
-  const uint32_t polynomial_size = params::degree;
-  const uint32_t glwe_dimension = gridDim.y - 1;
-  const uint32_t level_count = gridDim.z;
-
-  // The first product is used to initialize level_join_buffer
-  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
-
-  // Continues multiplying fft by every polynomial in that particular bsk level
-  // Each y-block accumulates in a different polynomial at each iteration
-  auto bsk_slice = get_ith_mask_kth_block_128(
-      bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
-      glwe_dimension, level_count);
-  for (int j = 0; j < glwe_dimension + 1; j++) {
-    int idx = (j + this_block_rank) % (glwe_dimension + 1);
-
-    auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
-    auto buffer_slice = get_join_buffer_element_128<G>(
-        blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
-        support_dsm);
-
-    polynomial_product_accumulate_in_fourier_domain_128<params>(
-        buffer_slice, fft, bsk_poly, j == 0);
-    group.sync();
-  }
-
-  // -----------------------------------------------------------------
-  // All blocks are synchronized here; after this sync, level_join_buffer has
-  // the values needed from every other block
-
-  // accumulate rest of the products into fft buffer
-  for (int l = 0; l < level_count; l++) {
-    auto cur_src_acc = get_join_buffer_element_128<G>(
-        l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
-        support_dsm);
-
-    polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
-                                                        l == 0);
-  }
-
-  __syncthreads();
-}
-
 template <typename Torus>
 void execute_pbs_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -279,7 +223,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
                         uint32_t level_count, uint32_t grouping_factor,
                         uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
                         bool allocate_gpu_memory, bool allocate_ms_array,
-                         uint64_t &size_tracker) {
+                         uint64_t *size_tracker) {
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits
@@ -287,7 +231,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
    case MULTI_BIT:
      PANIC("Error: 32-bit multibit PBS is not supported.\n")
    case CLASSICAL:
-      size_tracker = scratch_cuda_programmable_bootstrap_32(
+      *size_tracker = scratch_cuda_programmable_bootstrap_32(
          stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, level_count, input_lwe_ciphertext_count,
          allocate_gpu_memory, allocate_ms_array);
@@ -302,12 +246,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
    case MULTI_BIT:
      if (grouping_factor == 0)
        PANIC("Multi-bit PBS error: grouping factor should be > 0.")
-      size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
+      *size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
          stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
      break;
    case CLASSICAL:
-      size_tracker = scratch_cuda_programmable_bootstrap_64(
+      *size_tracker = scratch_cuda_programmable_bootstrap_64(
          stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, level_count, input_lwe_ciphertext_count,
          allocate_gpu_memory, allocate_ms_array);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_128.cuh
@@ -8,9 +8,9 @@ execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
                        uint32_t polynomial_size, uint32_t level_count,
                        uint32_t input_lwe_ciphertext_count,
                        bool allocate_gpu_memory, bool allocate_ms_array,
-                        uint64_t &size_tracker_on_gpu) {
+                        uint64_t *size_tracker_on_gpu) {
  // The squash noise function receives as input 64-bit integers
-  size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
+  *size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
      stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, level_count, input_lwe_ciphertext_count,
      allocate_gpu_memory, allocate_ms_array);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -286,7 +286,7 @@ __host__ uint64_t scratch_programmable_bootstrap_amortized(
          glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
          max_shared_memory);
  *pbs_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
-      buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
+      buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
  check_cuda_error(cudaGetLastError());
  return size_tracker;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -225,7 +225,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
-      allocate_gpu_memory, allocate_ms_array, size_tracker);
+      allocate_gpu_memory, allocate_ms_array, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -280,14 +280,13 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

-  auto lwe_chunk_size =
-      get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
-                                        polynomial_size, full_sm_keybundle);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  uint64_t size_tracker = 0;
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -386,7 +386,7 @@ __host__ uint64_t scratch_programmable_bootstrap(
  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
-      allocate_gpu_memory, allocate_ms_array, size_tracker);
+      allocate_gpu_memory, allocate_ms_array, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
@@ -141,7 +141,7 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
    PANIC("Cuda error (classical PBS): base log should be <= 64")

  // If the parameters contain noise reduction key, then apply it
-  if (ms_noise_reduction_key->num_zeros != 0) {
+  if (buffer->uses_noise_reduction) {
    uint32_t log_modulus = log2(polynomial_size) + 1;
    host_improve_noise_modulus_switch<InputTorus>(
        static_cast<cudaStream_t>(stream), gpu_index,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
@@ -18,6 +18,62 @@
 #include "programmable_bootstrap.cuh"
 #include "types/complex/operations.cuh"

+/** Perform the matrix multiplication between the GGSW and the GLWE,
+ * each block operating on a single level for mask and body.
+ * Both operands should be at fourier domain
+ *
+ * This function assumes:
+ *  - Thread blocks at dimension z relates to the decomposition level.
+ *  - Thread blocks at dimension y relates to the glwe dimension.
+ *  - polynomial_size / params::opt threads are available per block
+ */
+template <typename G, class params>
+__device__ void mul_ggsw_glwe_in_fourier_domain_128(
+    double *fft, double *join_buffer,
+    const double *__restrict__ bootstrapping_key, int iteration, G &group,
+    bool support_dsm = false) {
+  const uint32_t polynomial_size = params::degree;
+  const uint32_t glwe_dimension = gridDim.y - 1;
+  const uint32_t level_count = gridDim.z;
+
+  // The first product is used to initialize level_join_buffer
+  auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
+
+  // Continues multiplying fft by every polynomial in that particular bsk level
+  // Each y-block accumulates in a different polynomial at each iteration
+  auto bsk_slice = get_ith_mask_kth_block_128(
+      bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
+      glwe_dimension, level_count);
+  for (int j = 0; j < glwe_dimension + 1; j++) {
+    int idx = (j + this_block_rank) % (glwe_dimension + 1);
+
+    auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
+    auto buffer_slice = get_join_buffer_element_128<G>(
+        blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
+        support_dsm);
+
+    polynomial_product_accumulate_in_fourier_domain_128<params>(
+        buffer_slice, fft, bsk_poly, j == 0);
+    group.sync();
+  }
+
+  // -----------------------------------------------------------------
+  // All blocks are synchronized here; after this sync, level_join_buffer has
+  // the values needed from every other block
+
+  // accumulate rest of the products into fft buffer
+  for (int l = 0; l < level_count; l++) {
+    auto cur_src_acc = get_join_buffer_element_128<G>(
+        l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
+        support_dsm);
+
+    polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
+                                                        l == 0);
+  }
+
+  __syncthreads();
+}
+
 template <typename InputTorus, class params, sharedMemDegree SMD,
          bool first_iter>
 __global__ void __launch_bounds__(params::degree / params::opt)
@@ -118,6 +174,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
                                               accumulator);
  gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);

+  // We are using the same memory space for accumulator_fft and
+  // accumulator_rotated, so we need to synchronize here to make sure they
+  // don't modify the same memory space at the same time
  // Switch to the FFT space
  auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
  auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
@@ -396,7 +455,6 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array) {

-  cuda_set_device(gpu_index);
  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
          polynomial_size);
@@ -426,7 +484,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
  *buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
-      allocate_gpu_memory, allocate_ms_array, size_tracker);
+      allocate_gpu_memory, allocate_ms_array, &size_tracker);
  return size_tracker;
 }

@@ -533,7 +591,7 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
  *buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
-      allocate_gpu_memory, allocate_ms_array, size_tracker);
+      allocate_gpu_memory, allocate_ms_array, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -455,8 +455,11 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
 */
 template <typename Torus, class params>
 uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                            uint32_t polynomial_size,
-                            uint64_t full_sm_keybundle) {
+                            uint32_t polynomial_size) {
+
+  uint64_t full_sm_keybundle =
+      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
+          polynomial_size);

  int max_blocks_per_sm;
  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -18,20 +18,6 @@
 #include "types/complex/operations.cuh"
 #include <vector>

-template <typename Torus, class params>
-__device__ uint32_t calculates_monomial_degree(const Torus *lwe_array_group,
-                                               uint32_t ggsw_idx,
-                                               uint32_t grouping_factor) {
-  Torus x = 0;
-  for (int i = 0; i < grouping_factor; i++) {
-    uint32_t mask_position = grouping_factor - (i + 1);
-    int selection_bit = (ggsw_idx >> mask_position) & 1;
-    x += selection_bit * lwe_array_group[i];
-  }
-
-  return modulus_switch(x, params::log2_degree + 1);
-}
-
 __device__ __forceinline__ int
 get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
                          uint32_t level_count) {
@@ -521,14 +507,13 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

-  auto lwe_chunk_size =
-      get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
-                                        polynomial_size, full_sm_keybundle);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  uint64_t size_tracker = 0;
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cu
@@ -1,361 +0,0 @@
-#include "programmable_bootstrap_cg_multibit.cuh"
-#include "programmable_bootstrap_multibit_128.cuh"
-
-template <typename InputTorus>
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
-    void *stream, uint32_t gpu_index,
-    pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
-
-  switch (polynomial_size) {
-  case 256:
-    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<256>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  case 512:
-    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<512>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  case 1024:
-    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<1024>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  case 2048:
-    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  case 4096:
-    return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                        AmortizedDegree<4096>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  default:
-    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
-          "N's are powers of two"
-          " in the interval [256..4096].")
-  }
-}
-
-template <typename InputTorus>
-uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(
-    void *stream, uint32_t gpu_index,
-    pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
-
-  switch (polynomial_size) {
-  case 256:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<256>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  case 512:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<512>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  case 1024:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<1024>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  case 2048:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  case 4096:
-    return scratch_cg_multi_bit_programmable_bootstrap_128<
-        InputTorus, AmortizedDegree<4096>>(
-        static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
-        polynomial_size, level_count, input_lwe_ciphertext_count,
-        allocate_gpu_memory);
-  default:
-    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
-          "N's are powers of two"
-          " in the interval [256..4096].")
-  }
-}
-
-uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
-    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
-
-  bool supports_cg =
-      supports_cooperative_groups_on_multibit_programmable_bootstrap<
-          __uint128_t>(glwe_dimension, polynomial_size, level_count,
-                       input_lwe_ciphertext_count,
-                       cuda_get_max_shared_memory(gpu_index));
-
-  if (supports_cg)
-    return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(
-        stream, gpu_index,
-        reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, allocate_gpu_memory);
-  else
-    return scratch_cuda_multi_bit_programmable_bootstrap_128<uint64_t>(
-        stream, gpu_index,
-        reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, allocate_gpu_memory);
-}
-
-template <typename InputTorus>
-void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
-    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
-    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
-    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
-    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
-    pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-
-  switch (polynomial_size) {
-  case 256:
-    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 512:
-    host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 1024:
-    host_multi_bit_programmable_bootstrap_128<InputTorus,
-                                              AmortizedDegree<1024>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 2048:
-    host_multi_bit_programmable_bootstrap_128<InputTorus,
-                                              AmortizedDegree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 4096:
-    host_multi_bit_programmable_bootstrap_128<InputTorus,
-                                              AmortizedDegree<4096>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  default:
-    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
-          "N's are powers of two"
-          " in the interval [256..4096].")
-  }
-}
-
-template <typename InputTorus>
-void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
-    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
-    InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
-    InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
-    InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
-    pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_many_lut, uint32_t lut_stride) {
-
-  switch (polynomial_size) {
-  case 256:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<256>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 512:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<512>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 1024:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<1024>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 2048:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<2048>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  case 4096:
-    host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
-                                                 AmortizedDegree<4096>>(
-        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
-        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
-        lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_many_lut, lut_stride);
-    break;
-  default:
-    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
-          "N's are powers of two"
-          " in the interval [256..4096].")
-  }
-}
-
-void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
-    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_output_indexes, void const *lut_vector,
-    void const *lut_vector_indexes, void const *lwe_array_in,
-    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
-    uint32_t lut_stride) {
-
-  if (base_log > 64)
-    PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
-
-  auto *buffer =
-      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
-  switch (buffer->pbs_variant) {
-  case PBS_VARIANT::CG:
-    cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<
-        uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
-                  static_cast<const uint64_t *>(lwe_output_indexes),
-                  static_cast<const __uint128_t *>(lut_vector),
-                  static_cast<const uint64_t *>(lut_vector_indexes),
-                  static_cast<const uint64_t *>(lwe_array_in),
-                  static_cast<const uint64_t *>(lwe_input_indexes),
-                  static_cast<const __uint128_t *>(bootstrapping_key), buffer,
-                  lwe_dimension, glwe_dimension, polynomial_size,
-                  grouping_factor, base_log, level_count, num_samples,
-                  num_many_lut, lut_stride);
-    break;
-  case PBS_VARIANT::DEFAULT:
-    cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
-        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
-        static_cast<const uint64_t *>(lwe_output_indexes),
-        static_cast<const __uint128_t *>(lut_vector),
-        static_cast<const uint64_t *>(lut_vector_indexes),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(lwe_input_indexes),
-        static_cast<const __uint128_t *>(bootstrapping_key), buffer,
-        lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-        base_log, level_count, num_samples, num_many_lut, lut_stride);
-    break;
-  default:
-    PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
-  }
-}
-
-void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
-                                                       const uint32_t gpu_index,
-                                                       int8_t **buffer) {
-  const auto x =
-      reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(*buffer);
-  x->release(static_cast<cudaStream_t>(stream), gpu_index);
-}
-
-/**
- * Computes divisors of the product of num_sms (streaming multiprocessors on the
- * GPU) and max_blocks_per_sm (maximum active blocks per SM to launch
- * device_multi_bit_programmable_bootstrap_keybundle) smaller than its square
- * root, based on max_num_pbs. If log2(max_num_pbs) <= 13, selects the first
- * suitable divisor. If greater, calculates an offset as max(1,log2(max_num_pbs)
- * - 13) for additional logic.
- *
- * The value 13 was empirically determined based on memory requirements for
- * benchmarking on an RTX 4090 GPU, balancing performance and resource use.
- */
-template <typename Torus, class params>
-uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
-                                uint32_t polynomial_size,
-                                uint64_t full_sm_keybundle) {
-
-  int max_blocks_per_sm;
-  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-  cuda_set_device(gpu_index);
-  if (max_shared_memory < full_sm_keybundle)
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_blocks_per_sm,
-        device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
-                                                              NOSM>,
-        polynomial_size / params::opt, full_sm_keybundle);
-  else
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_blocks_per_sm,
-        device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
-                                                              FULLSM>,
-        polynomial_size / params::opt, 0);
-
-  int num_sms = 0;
-  check_cuda_error(cudaDeviceGetAttribute(
-      &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
-
-  int x = num_sms * max_blocks_per_sm;
-  int count = 0;
-
-  int divisor = 1;
-  int ith_divisor = 0;
-
-#if CUDA_ARCH < 900
-  // We pick a smaller divisor on GPUs other than H100, so 256-bit integer
-  // multiplication can run
-  int log2_max_num_pbs = log2_int(max_num_pbs);
-  if (log2_max_num_pbs > 13)
-    ith_divisor = log2_max_num_pbs - 11;
-#endif
-
-  for (int i = sqrt(x); i >= 1; i--) {
-    if (x % i == 0) {
-      if (count == ith_divisor) {
-        divisor = i;
-        break;
-      } else {
-        count++;
-      }
-    }
-  }
-
-  return divisor;
-}
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit_128.cuh
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -253,7 +253,7 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
      level_count, input_lwe_ciphertext_count, PBS_VARIANT::TBC,
-      allocate_gpu_memory, allocate_ms_array, size_tracker);
+      allocate_gpu_memory, allocate_ms_array, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -283,14 +283,13 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
    check_cuda_error(cudaGetLastError());
  }

-  auto lwe_chunk_size =
-      get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
-                                        polynomial_size, full_sm_keybundle);
+  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  uint64_t size_tracker = 0;
  *buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
-      allocate_gpu_memory, size_tracker);
+      allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper.cuh
@@ -5,15 +5,15 @@
 #include <stdio.h>
 #include <type_traits>

-template <typename T> __device__ inline const char *get_format();
+template <typename T> inline __device__ const char *get_format();

-template <> __device__ inline const char *get_format<int>() { return "%d, "; }
+template <> inline __device__ const char *get_format<int>() { return "%d, "; }

-template <> __device__ inline const char *get_format<unsigned int>() {
+template <> inline __device__ const char *get_format<unsigned int>() {
  return "%u, ";
 }

-template <> __device__ inline const char *get_format<uint64_t>() {
+template <> inline __device__ const char *get_format<uint64_t>() {
  return "%lu, ";
 }

@@ -23,15 +23,6 @@ template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
  }
 }

-template <>
-__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
-  for (int i = 0; i < N; i++) {
-    uint64_t low = static_cast<uint64_t>(src[i]);
-    uint64_t high = static_cast<uint64_t>(src[i] >> 64);
-    printf("(%llu, %llu), ", high, low);
-  }
-}
-
 template <>
 __global__ inline void print_debug_kernel(const double2 *src, int N) {
  for (int i = 0; i < N; i++) {
@@ -47,19 +38,17 @@ template <typename T> void print_debug(const char *name, const T *src, int N) {
 }

 template <typename T>
-__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
+__global__ void print_body_kernel(T *src, int N, int lwe_dimension) {
  for (int i = 0; i < N; i++) {
-    T body = src[i * (lwe_dimension + 1) + lwe_dimension];
-    T clear = body / delta;
-    printf("(%lu, %lu), ", body, clear);
+    printf("%lu, ", src[i * (lwe_dimension + 1) + lwe_dimension]);
  }
 }

 template <typename T>
-void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
+void print_body(const char *name, T *src, int n, int lwe_dimension) {
  printf("%s: ", name);
  cudaDeviceSynchronize();
-  print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
+  print_body_kernel<<<1, 1>>>(src, n, lwe_dimension);
  cudaDeviceSynchronize();
  printf("\n");
 }
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh
@@ -9,7 +9,7 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
                                 uint32_t const *gpu_indexes,
                                 uint32_t gpu_count, std::vector<Torus *> &dest,
                                 uint32_t elements_per_gpu,
-                                 uint64_t &size_tracker_on_gpu_0,
+                                 uint64_t *size_tracker_on_gpu_0,
                                 bool allocate_gpu_memory) {

  dest.resize(gpu_count);
@@ -17,10 +17,10 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
    uint64_t size_tracker_on_gpu_i = 0;
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        elements_per_gpu * sizeof(Torus), streams[i], gpu_indexes[i],
-        size_tracker_on_gpu_i, allocate_gpu_memory);
+        &size_tracker_on_gpu_i, allocate_gpu_memory);
    dest[i] = d_array;
-    if (i == 0) {
-      size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
+    if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
+      *size_tracker_on_gpu_0 = size_tracker_on_gpu_i;
    }
  }
 }
@@ -46,7 +46,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
                               uint32_t const *gpu_indexes, uint32_t gpu_count,
                               std::vector<Torus *> &dest, uint32_t num_inputs,
                               uint32_t lwe_size,
-                               uint64_t &size_tracker_on_gpu_0,
+                               uint64_t *size_tracker_on_gpu_0,
                               bool allocate_gpu_memory) {
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
@@ -54,10 +54,10 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
-        size_tracker_on_gpu_i, allocate_gpu_memory);
+        &size_tracker_on_gpu_i, allocate_gpu_memory);
    dest[i] = d_array;
-    if (i == 0) {
-      size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
+    if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
+      *size_tracker_on_gpu_0 = size_tracker_on_gpu_i;
    }
  }
 }
@@ -65,7 +65,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
 template void multi_gpu_alloc_lwe_async<__uint128_t>(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
-    uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
+    uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
    bool allocate_gpu_memory);

 /// Allocates the input/output vector for all devices
@@ -75,7 +75,7 @@ template <typename Torus>
 void multi_gpu_alloc_lwe_many_lut_output_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, std::vector<Torus *> &dest, uint32_t num_inputs,
-    uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
+    uint32_t num_many_lut, uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
    bool allocate_gpu_memory) {
  dest.resize(gpu_count);
  for (uint i = 0; i < gpu_count; i++) {
@@ -83,10 +83,10 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
    auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
-        gpu_indexes[i], size_tracker, allocate_gpu_memory);
+        gpu_indexes[i], &size_tracker, allocate_gpu_memory);
    dest[i] = d_array;
-    if (i == 0) {
-      size_tracker_on_gpu_0 += size_tracker;
+    if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
+      *size_tracker_on_gpu_0 = size_tracker;
    }
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/zk/expand.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/expand.cu
@@ -3,8 +3,8 @@
 void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
                        void *lwe_array_out, const void *lwe_compact_array_in,
                        uint32_t lwe_dimension, uint32_t num_lwe,
-                        const uint32_t *lwe_compact_input_indexes,
-                        const uint32_t *output_body_id_per_compact_list) {
+                        const void *lwe_compact_input_indexes,
+                        const void *output_body_id_per_compact_list) {

  switch (lwe_dimension) {
  case 256:
@@ -12,49 +12,56 @@ void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 512:
    host_lwe_expand<uint64_t, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 1024:
    host_lwe_expand<uint64_t, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 2048:
    host_lwe_expand<uint64_t, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 4096:
    host_lwe_expand<uint64_t, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 8192:
    host_lwe_expand<uint64_t, AmortizedDegree<8192>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  case 16384:
    host_lwe_expand<uint64_t, AmortizedDegree<16384>>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_compact_array_in), num_lwe,
-        lwe_compact_input_indexes, output_body_id_per_compact_list);
+        static_cast<const uint32_t *>(lwe_compact_input_indexes),
+        static_cast<const uint32_t *>(output_body_id_per_compact_list));
    break;
  default:
    PANIC("CUDA error: lwe_dimension not supported."
--- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
@@ -34,7 +34,8 @@ __host__ void host_expand_without_verification(
  if (sizeof(Torus) == 8) {
    cuda_lwe_expand_64(streams[0], gpu_indexes[0], expanded_lwes,
                       lwe_flattened_compact_array_in, lwe_dimension, num_lwes,
-                       d_lwe_compact_input_indexes, d_body_id_per_compact_list);
+                       (void *)d_lwe_compact_input_indexes,
+                       (void *)d_body_id_per_compact_list);

  } else
    PANIC("Cuda error: expand is only supported on 64 bits")
@@ -94,7 +95,7 @@ __host__ uint64_t scratch_cuda_expand_without_verification(
  *mem_ptr = new zk_expand_mem<Torus>(
      streams, gpu_indexes, gpu_count, computing_params, casting_params,
      casting_key_type, num_lwes_per_compact_list, is_boolean_array,
-      num_compact_lists, allocate_gpu_memory, size_tracker);
+      num_compact_lists, allocate_gpu_memory, &size_tracker);
  return size_tracker;
 }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Guillermo Oyarzun	35b3430404	chore(gpu): fix some pke bugs	2025-06-19 12:22:39 +02:00
Guillermo Oyarzun	a56a8107ab	chore(gpu): reduce the pool size	2025-06-17 13:20:58 +02:00
Guillermo Oyarzun	b322708cbd	chore(gpu): add drift to pke gpu tests	2025-06-16 17:36:11 +02:00
Guillermo Oyarzun	25da5ef721	chore(gpu): add pke gpu tests	2025-06-16 15:17:54 +02:00
Guillermo Oyarzun	43b95aaf7c	chore(gpu): fix input_br variance formula for multi-bit	2025-06-16 12:27:27 +02:00
Guillermo Oyarzun	cfc19d3bd2	chore(gpu): correct ms formula in pbs128	2025-06-16 12:27:27 +02:00
Guillermo Oyarzun	51427fc9ae	chore(gpu): add multiplication and extra logs to pbs128 test	2025-06-16 12:27:26 +02:00
Guillermo Oyarzun	a45baaa3d0	chore(gpu): fixes after rebase	2025-06-16 12:27:26 +02:00
Guillermo Oyarzun	a24adf528b	chore(gpu): use the m128 params	2025-06-16 12:27:26 +02:00
Guillermo Oyarzun	3ddf9bdba6	chore(gpu): fix pbs128 after rebasing	2025-06-16 12:27:25 +02:00
Guillermo Oyarzun	de4902fb9f	chore(gpu): add noise squashing params without using it	2025-06-16 12:27:25 +02:00
Guillermo Oyarzun	b89bca0d13	chore(gpu): fix pbs128 with old params	2025-06-16 12:27:25 +02:00
Guillermo Oyarzun	f0dc0e18ab	chore(gpu): add pbs128	2025-06-16 12:27:24 +02:00
Guillermo Oyarzun	98ca66581e	chore(gpu): fix errors after rebasing	2025-06-16 12:27:24 +02:00
Guillermo Oyarzun	c94ccc3a23	feat(gpu): fix bugs and add extra logs	2025-06-16 12:27:24 +02:00
Guillermo Oyarzun	7e6573a1d2	feat(gpu): fix compression tests after rebase	2025-06-16 12:27:23 +02:00
Guillermo Oyarzun	94ff21b089	feat(gpu): add noise checks with multi-bit pbs	2025-06-16 12:27:23 +02:00
Guillermo Oyarzun	71420f0d92	feat(gpu): add noise checks	2025-06-16 12:27:23 +02:00
Arthur Meyre	d81bd4ebd6	wip: long pfail runs	2025-06-16 12:27:22 +02:00
Nicolas Sarlin	83d1d6a46c	wip: fix packing ks noise formula	2025-06-16 12:27:22 +02:00
Nicolas Sarlin	ac693f97e1	wip: remove check that makes compression pfail test fail	2025-06-16 12:27:22 +02:00
Nicolas Sarlin	d06656cfb4	wip: remove check that makes compression pfail test fail	2025-06-16 12:27:21 +02:00
Nicolas Sarlin	472ea682ae	wip: update noise formulas	2025-06-16 12:27:21 +02:00
Nicolas Sarlin	258524f5e2	chore(zk): add noise tests for zkv1	2025-06-16 12:27:21 +02:00
Nicolas Sarlin	6b98865515	fix: new param naming	2025-06-16 12:27:20 +02:00
Arthur Meyre	e47731b1ee	fix(shortint): fix sample count in pbs128 pfail	2025-06-16 12:27:20 +02:00
Arthur Meyre	d1b9bc676d	test(shortint): add variance check after KS in classic PBS AP	2025-06-16 12:27:19 +02:00
Arthur Meyre	405952e323	test(shortint): add normality check after KS in classic AP	2025-06-16 12:27:19 +02:00
Arthur Meyre	7556a8e05f	fix(test): fix slighlty wrong log message	2025-06-16 12:27:19 +02:00
Arthur Meyre	ebda1426e4	fix(test): fix test parameters for PBS 128 Disable the bound check as our computation disagrees with the RO	2025-06-16 12:27:18 +02:00
Arthur Meyre	da91075b26	test(shortint): add pfail estimation after ms in compression	2025-06-16 12:27:18 +02:00
Arthur Meyre	f95eb2cf2c	wip: re-exported pbs 128 with symbolic mantissa - add pbs 128 params with a mantissa setting - noise with this settings is in line with RO prediction	2025-06-16 12:27:17 +02:00
Arthur Meyre	45da14c7dd	wip: add pbs 128 tests (noise and pfail)	2025-06-16 12:27:16 +02:00
Arthur Meyre	75e03ae800	feat(core): add pbs 128 noise formulas	2025-06-16 12:27:16 +02:00
Arthur Meyre	9976cbe1f2	test(shortint): add pfail measurement for full compression + AP	2025-06-16 12:27:15 +02:00
Arthur Meyre	a9006486e8	test(shortint): slightly change decryption logic to also get the padding bit out	2025-06-16 12:27:14 +02:00
Arthur Meyre	7f8778f178	test(shortint): add pfail check for first part of compression	2025-06-16 12:27:14 +02:00
Arthur Meyre	370e4ae2e6	test(shortint): add first part of the compression AP	2025-06-16 12:27:13 +02:00
Arthur Meyre	b6db9d8ba0	chore: rename helper function to match test usage	2025-06-16 12:27:12 +02:00
Arthur Meyre	54b139b1b5	feat(core): add noise formula for packing keyswitch	2025-06-16 12:27:12 +02:00
Arthur Meyre	d3ccf08f2c	test(shortint): add pfail test for PKE -> KS -> (DP -> KS Compute) -> MS	2025-06-16 12:27:11 +02:00
Arthur Meyre	6f1492766f	refactor(test): also accept PKE -> KS to big -> DP -> KS -> MS	2025-06-16 12:27:10 +02:00
Arthur Meyre	03fa607209	test(shortint): add noise check for PKE -> KS to small key + MS	2025-06-16 12:27:09 +02:00
Arthur Meyre	29ab6c0709	wip: hotfix for borrow mut error, this needs a design pass/reflection	2025-06-16 12:27:09 +02:00
Arthur Meyre	1c0b428cd3	feat: add noise formulas for TUniform 132 bits of security	2025-06-16 12:27:08 +02:00
Arthur Meyre	dab2d39749	test(hl): add noise check for CompactPublicKey encryption w/ TUniform param	2025-06-16 12:27:07 +02:00
Arthur Meyre	ea81ef5d15	test(shortint): add pfail test for the classic AP	2025-06-16 12:27:06 +02:00
Arthur Meyre	57a7a5a084	refactor: prepare code factorization for noise and pfail	2025-06-16 12:27:06 +02:00
Arthur Meyre	9010ded3d5	test: add shortint atomic pattern noise measurement	2025-06-16 12:27:05 +02:00
Arthur Meyre	1ebd2848ad	test(hl): test encryption noise of FheUint ciphertext in HL API	2025-06-16 12:27:04 +02:00
Nicolas Sarlin	6a1a024e6d	chore(zk)!: store inside the pke params the supported zk scheme BREAKING_CHANGE: - Zk for compact PKE now requires dedicated encryption parameters	2025-06-16 12:27:04 +02:00