Compare commits

...

63 Commits

Author SHA1 Message Date
Agnes Leroy
ba99b2feda print 2025-07-16 11:18:32 +02:00
Agnes Leroy
b0dfd5609c debug 2025-07-16 11:13:44 +02:00
Agnes Leroy
5083efcdc9 debug 2025-07-16 11:09:29 +02:00
Agnes Leroy
2d676fe0cb debug 2025-07-16 11:04:00 +02:00
Agnes Leroy
ec18468ab4 debug 2025-07-16 10:56:45 +02:00
Agnes Leroy
1541281769 debug 2025-07-16 10:52:44 +02:00
Agnes Leroy
7e15535bda print 2025-07-16 10:46:10 +02:00
Agnes Leroy
35ee34da1b print 2025-07-15 16:22:43 +02:00
Agnes Leroy
589528ffd4 print 2025-07-15 16:12:18 +02:00
Agnes Leroy
dfcc37ec98 lower threshold for testing 2025-07-15 15:55:20 +02:00
Agnes Leroy
0e56b47903 print 2025-07-15 15:54:15 +02:00
Agnes Leroy
9fac4b81d1 print 2025-07-15 12:03:22 +02:00
Agnes Leroy
c26637e6fa debug 2025-07-15 11:48:17 +02:00
Agnes Leroy
3bf603ae1e chore(gpu): change multi gpu logic 2025-07-15 11:31:04 +02:00
Agnes Leroy
0d41b4f445 chore(gpu): add bench command for cuda and update weekly bench 2025-07-11 14:04:32 +01:00
Agnes Leroy
068cbc0f41 chore(gpu): add hl api noise squash latency and throughput bench 2025-07-11 14:04:32 +01:00
Agnes Leroy
f8947ddff3 chore(gpu): remove nightly schedule now that ci is lighter 2025-07-11 12:43:36 +01:00
Pedro Alves
1b98312e2c fix(gpu): fix regression on ERC20 throughput
- partially revert changes done in fd79c4f972
- transfers for the GPU case should be measured using sequential
  operations (without rayon!); see the sketch after the commit list
2025-07-11 08:57:19 +01:00
Pedro Alves
d3dd010deb fix(gpu): reduces number of elements in the ZK throughput benchmark 2025-07-11 08:57:01 +01:00
Agnes Leroy
15762623d1 chore(gpu): minor refactor in sum ctxt 2025-07-10 16:24:02 +01:00
Beka Barbakadze
c6865ab880 fix(gpu): fix pbs128 multi-gpu bug
Signed-off-by: Beka Barbakadze <beka.barbakadze@zama.ai>
2025-07-10 15:54:27 +01:00
Enzo Di Maria
e376df2fa4 refactor(gpu): moving unsigned_scalar_div_rem and signed_scalar_div_rem to the backend 2025-07-10 09:24:13 +02:00
Arthur Meyre
bd739c2d48 chore(docs): uniformize paths in docs to use "-" instead of "_"
- this is to avoid conflicts with gitbook
2025-07-09 14:36:04 +02:00
Pedro Alves
9960f5e8b6 fix(gpu): Fix expand bench on multi-gpus 2025-07-09 09:17:55 +01:00
Nicolas Sarlin
776f08b534 chore(ci): remove close_data_pr workflow 2025-07-09 09:31:29 +02:00
David Testé
ac13eed3b1 chore(ci): allow git lfs sync between repositories
Since the integration of the HPU backend, some Git LFS references need to be synced along with the rest of the codebase. The valtech-sd/git-sync action, a fork of wei/git-sync, allows pushing Git LFS references to another repository.
2025-07-09 09:07:48 +02:00
Arthur Meyre
17d3a492b6 chore: only run backward compat clippy on x86 machines
- older versions of the crates are only compilable on x86, disable on ARM
for now
- revisit when the crates are split?
2025-07-09 08:29:12 +02:00
Enzo Di Maria
ba87f1ba5e chore(gpu): removing useless arguments 2025-07-08 16:17:51 +02:00
Nicolas Sarlin
c70ad3374e chore(ci): allow workflows to run concurrently on main 2025-07-08 09:57:25 +02:00
Nicolas Sarlin
c7ec835e5f chore: adds params_to_file for noise squashing compression 2025-07-07 17:31:28 +02:00
Agnes Leroy
075b2259d3 chore(gpu): reduce ci time by reducing testing of unused parameters 2025-07-07 16:30:35 +01:00
Pedro Alves
23ebd42209 fix(gpu): fix compression throughput benchmark 2025-07-07 16:30:24 +01:00
Nicolas Sarlin
bb1ff363d3 chore(ci): use Cargo.lock for installed tools 2025-07-07 13:10:55 +02:00
Nicolas Sarlin
7bcd6b94da chore: use script to pull hpu files 2025-07-07 13:10:55 +02:00
Nicolas Sarlin
57cbab9fe1 chore(backward): integrate backward compat data
Code is taken from
59a6179831

Adapted to make CI work
2025-07-07 13:10:55 +02:00
Andrei Stoian
97ce0f6ecf feat(gpu): update GPU documentation 2025-07-07 09:44:43 +02:00
Nicolas Sarlin
b6c21ef1fe docs: describe noise squashed compression 2025-07-07 09:32:51 +02:00
Nicolas Sarlin
e599608831 chore(shortint): make decrypt_no_decode public 2025-07-07 09:30:14 +02:00
Arthur Meyre
f243491442 chore(docs): add features to the rust_configuration page 2025-07-04 17:06:15 +02:00
Arthur Meyre
b5248930a2 chore(docs): add handbook in explanation section 2025-07-04 17:06:15 +02:00
Arthur Meyre
2d280d98d2 chore(docs): add handbook in the security and cryptography section 2025-07-04 17:06:15 +02:00
Arthur Meyre
10b57f8a8e chore(docs): add link to GPU and HPU backend docs in the installation page 2025-07-04 17:06:15 +02:00
Arthur Meyre
242df05eb2 chore(docs): add links to GPU and HPU backend on front page 2025-07-04 17:06:15 +02:00
Arthur Meyre
899d4a7750 docs: add noise squashing documentation 2025-07-04 16:08:25 +02:00
Agnes Leroy
48dfeb21dc chore(gpu): refactor size tracker to avoid future bugs 2025-07-04 14:37:02 +01:00
Skylar Ray
a46ce3fb51 chore: fix typo in classic.rs 2025-07-04 13:33:15 +02:00
Arthur Meyre
192777bde6 chore(ci): handle unverified PRs to autoclose 2025-07-04 13:18:35 +02:00
Dmitry
3aa198311c fix: broken GPU arg due to typo 2025-07-04 11:04:14 +01:00
David Testé
7034d4ceb4 doc(bench): update benchmark results tables
All the results use parameter sets with a p-fail of 2**-128.
CPU tables using parameter sets with a p-fail of 2**-64 are removed.
GPU tables for 1xH100 and 2xH100 are now replaced with the new
hardware standard: 8xH100-SXM5.
HPU results are added to the backend comparison table and integrate
the latest operations available.
2025-07-04 10:06:14 +02:00
Arthur Meyre
799ae92f59 chore: remove dead link from docs 2025-07-04 10:04:22 +02:00
Arthur Meyre
36e9371fdf test: use hamming weight = 1/2 for core noise tests
- allows for less variability and matches exactly what the noise
formulas expect for uniform binary secret keys
2025-07-04 09:55:35 +02:00
Pedro Alves
8c88678ee8 feat(gpu): implement 128-bit multi-bit PBS 2025-07-03 20:34:32 -03:00
leopardracer
e1beea5ecb chore: Update test_user_docs.rs 2025-07-03 20:08:13 +02:00
Agnes Leroy
701411044b chore(gpu): update SXM5 cost 2025-07-03 17:00:02 +01:00
JJ-hw
405fdec6b9 fix(hpu): Fix iop_propagate_msb_to_lsb_blockv: propagation in the application was not done correctly 2025-07-03 14:31:59 +02:00
Agnes Leroy
b3355e2b2f chore(gpu): remove template from sum ciphertexts, add two missing delete 2025-07-03 12:51:29 +01:00
Agnes Leroy
e4d856afdf chore(gpu): update noise squashing parameters 2025-07-03 12:51:19 +01:00
Pedro Alves
22ddba7145 fix(gpu): refactor the (128-bit and regular) classical PBS entry point to remove the num_samples parameter
- fixes the throughput for those PBSs
- also fixes the throughput benchmark for regular PBSs
2025-07-03 08:23:09 -03:00
David Testé
d955696fe0 chore(bench): reduce number of bit sizes to benchmark
This is done to reduce execution time, since 4-bit precision is not useful to measure.
2025-07-03 12:45:02 +02:00
Baptiste Roux
eb0b9643bb fix(hpu): Fix clippy_hpu_mockup makefile entry 2025-07-03 10:28:52 +02:00
Arthur Meyre
d68305e984 chore: change link to point to the FHE.org discord for support 2025-07-03 10:28:10 +02:00
Enzo Di Maria
3d64316c66 refactor(gpu): moving signed_scalar_div_async and get_signed_scalar_div to the backend 2025-07-03 08:52:04 +01:00
Agnes Leroy
4bba35e926 chore(gpu): remove m3_c3 & gf 3 params from multi-gpu tests to reduce ci time 2025-07-02 17:18:26 +01:00
349 changed files with 12710 additions and 2678 deletions
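The sketch referenced from commit 1b98312e2c above: the ERC20 throughput regression came from measuring GPU transfers under rayon instead of sequentially. A minimal Rust illustration of the two measurement styles; `transfer` is a made-up CPU stand-in, not the benchmark's actual code.

```rust
use rayon::prelude::*;
use std::time::Instant;

// Made-up stand-in for one encrypted ERC20 transfer.
fn transfer(i: u64) -> u64 {
    i.wrapping_mul(0x9E37_79B9)
}

fn main() {
    let n = 1_000_000u64;

    // Sequential measurement: operations are issued one after the other,
    // which is how the GPU benchmark should be timed, since the backend
    // already parallelizes each operation internally on the device.
    let start = Instant::now();
    let _seq: Vec<u64> = (0..n).map(transfer).collect();
    println!("sequential: {:?}", start.elapsed());

    // Parallel measurement with rayon. On this CPU toy it may even be
    // faster, but in the GPU case the extra host threads contend for the
    // same device and distorted the throughput numbers; the fix reverts
    // to the sequential style above.
    let start = Instant::now();
    let _par: Vec<u64> = (0..n).into_par_iter().map(transfer).collect();
    println!("rayon:      {:?}", start.elapsed());
}
```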

.gitattributes
View File

@@ -1 +1,3 @@
*.hpu filter=lfs diff=lfs merge=lfs -text
*.bcode filter=lfs diff=lfs merge=lfs -text
*.cbor filter=lfs diff=lfs merge=lfs -text

View File

@@ -71,45 +71,26 @@ jobs:
with:
toolchain: stable
- name: Use specific data branch
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
env:
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
# Cache key is an aggregated hash of the LFS file hashes
- name: Get LFS data sha
id: hash-lfs-data
run: |
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
- name: Get backward compat branch
id: backward_compat_branch
run: |
BRANCH="$(make backward_compat_branch)"
echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
- name: Get backward compat branch head SHA
id: backward_compat_sha
run: |
SHA=$(git ls-remote "${REPO_URL}" refs/heads/"${BACKWARD_COMPAT_BRANCH}" | awk '{print $1}')
SHA=$(git lfs ls-files -l -I utils/tfhe-backward-compat-data | sha256sum | cut -d' ' -f1)
echo "sha=${SHA}" >> "${GITHUB_OUTPUT}"
env:
REPO_URL: "https://github.com/zama-ai/tfhe-backward-compat-data"
BACKWARD_COMPAT_BRANCH: ${{ steps.backward_compat_branch.outputs.branch }}
- name: Retrieve data from cache
id: retrieve-data-cache
uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
with:
path: tests/tfhe-backward-compat-data
key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}
path: |
utils/tfhe-backward-compat-data/**/*.cbor
utils/tfhe-backward-compat-data/**/*.bcode
key: ${{ steps.hash-lfs-data.outputs.sha }}
- name: Clone test data
- name: Pull test data
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
repository: zama-ai/tfhe-backward-compat-data
path: tests/tfhe-backward-compat-data
lfs: 'true'
ref: ${{ steps.backward_compat_branch.outputs.branch }}
run: |
make pull_backward_compat_data
- name: Run backward compatibility tests
run: |
@@ -120,8 +101,10 @@ jobs:
continue-on-error: true
uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
with:
path: tests/tfhe-backward-compat-data
key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}
path: |
utils/tfhe-backward-compat-data/**/*.cbor
utils/tfhe-backward-compat-data/**/*.bcode
key: ${{ steps.hash-lfs-data.outputs.sha }}
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
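For reference, the new cache key above is a single SHA-256 over the LFS pointer list printed by `git lfs ls-files -l`, so the cache is invalidated exactly when the data files change. A minimal Rust sketch of the same aggregation, assuming the `sha2` crate; the object IDs below are made up:

```rust
use sha2::{Digest, Sha256};

// Aggregate many LFS object IDs into one cache key, mirroring
// `git lfs ls-files -l ... | sha256sum` in the workflow above.
fn lfs_cache_key(object_ids: &[&str]) -> String {
    let mut hasher = Sha256::new();
    for oid in object_ids {
        hasher.update(oid.as_bytes());
        hasher.update(b"\n");
    }
    hasher
        .finalize()
        .iter()
        .map(|b| format!("{b:02x}"))
        .collect()
}

fn main() {
    // Hypothetical LFS object IDs; changing any one of them changes the key.
    let key = lfs_cache_key(&["0a1b2c...", "3d4e5f..."]);
    println!("cache key: {key}");
}
```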

View File

@@ -103,7 +103,7 @@ jobs:
name: Unsigned integer tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
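This concurrency hunk repeats across the workflow files below (commit c70ad3374e, "allow workflows to run concurrently on main"). GitHub Actions expressions have no ternary operator, so `cond && a || b` emulates one. A sketch of the resulting grouping logic, as a hypothetical Rust helper rather than anything in the repo:

```rust
fn concurrency_group(workflow_ref: &str, git_ref: &str, sha: &str) -> String {
    // On main, append the commit SHA so every push gets a unique group and
    // runs no longer queue behind one another; elsewhere the group stays
    // shared, so cancel-in-progress cancels the older run.
    let suffix = if git_ref == "refs/heads/main" { sha } else { "" };
    format!("{workflow_ref}{suffix}")
}

fn main() {
    // Two pushes to main land in different groups: they run concurrently.
    assert_ne!(
        concurrency_group("ci.yml@main", "refs/heads/main", "abc123"),
        concurrency_group("ci.yml@main", "refs/heads/main", "def456"),
    );
    // Two pushes to the same PR branch share a group: the older run is cancelled.
    assert_eq!(
        concurrency_group("ci.yml@pr", "refs/heads/feat", "abc123"),
        concurrency_group("ci.yml@pr", "refs/heads/feat", "def456"),
    );
    println!("grouping logic OK");
}
```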

View File

@@ -104,7 +104,7 @@ jobs:
name: Signed integer tests
needs: setup-instance
concurrency:
group: ${{ github.workflow_ref }}
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:

View File

@@ -31,6 +31,7 @@ on:
- ks
- ks_pbs
- integer_zk
- hlapi_noise_squash
op_flavor:
description: "Operations set to run"
type: choice

View File

@@ -10,37 +10,16 @@ on:
permissions: {}
jobs:
run-benchmarks-1-h100:
name: Run integer benchmarks (1xH100)
run-benchmarks-8-h100-sxm5-integer:
name: Run integer benchmarks (8xH100-SXM5)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: single-h100
hardware_name: n3-H100x1
command: integer,integer_multi_bit
op_flavor: default
bench_type: latency
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-2-h100:
name: Run integer benchmarks (2xH100)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: 2-h100
hardware_name: n3-H100x2
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: integer_multi_bit
op_flavor: default
bench_type: latency
bench_type: both
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -52,16 +31,16 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100:
name: Run integer benchmarks (8xH100)
run-benchmarks-8-h100-sxm5-integer-compression:
name: Run integer compression benchmarks (8xH100-SXM5)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: multi-h100
hardware_name: n3-H100x8
command: integer_multi_bit
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: integer_compression
op_flavor: default
bench_type: latency
bench_type: both
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -73,16 +52,37 @@ jobs:
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-l40:
name: Run integer benchmarks (L40)
run-benchmarks-8-h100-sxm5-integer-zk:
name: Run integer zk benchmarks (8xH100-SXM5)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: l40
hardware_name: n3-L40x1
command: integer_multi_bit,integer_compression,pbs,ks
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: integer_zk
op_flavor: default
bench_type: latency
bench_type: both
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-8-h100-sxm5-noise-squash:
name: Run noise squash benchmarks (8xH100-SXM5)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
profile: multi-h100-sxm5
hardware_name: n3-H100x8-SXM5
command: hlapi_noise_squash
op_flavor: default
bench_type: both
all_precisions: true
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}

View File

@@ -62,7 +62,7 @@ jobs:
- name: Run benchmarks
run: |
git lfs pull --include="*" --exclude=""
make pull_hpu_files
make bench_integer_hpu
make bench_hlapi_erc20_hpu

View File

@@ -48,7 +48,7 @@ jobs:
name: Execute FFT benchmarks in EC2
needs: setup-ec2
concurrency:
group: ${{ github.workflow_ref }}
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:

View File

@@ -48,7 +48,7 @@ jobs:
name: Execute NTT benchmarks in EC2
needs: setup-ec2
concurrency:
group: ${{ github.workflow_ref }}
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:

View File

@@ -13,7 +13,7 @@ env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true
permissions:

View File

@@ -13,7 +13,7 @@ env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true
permissions:

View File

@@ -1,63 +0,0 @@
name: Close or Merge corresponding PR on the data repo
# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
env:
DATA_REPO: zama-ai/tfhe-backward-compat-data
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}
# only trigger on pull request closed events
on:
pull_request:
types: [ closed ]
permissions: {}
jobs:
auto_close_job:
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') && github.repository == 'zama-ai/tfhe-rs' }}
runs-on: ubuntu-latest
env:
GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }} # Needed for gh CLI commands
steps:
- name: Fetch PR number
run: |
PR_NUMBER=$(gh pr view "${PR_BRANCH}" --repo "${DATA_REPO}" --json number | jq '.number')
echo "DATA_REPO_PR_NUMBER=${PR_NUMBER}" >> "${GITHUB_ENV}"
- name: Comment on the PR to indicate the reason of the close
run: |
gh pr comment "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--body "PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}"
env:
REPO: ${{ github.repository }}
EVENT_NUMBER: ${{ github.event.number }}
- name: Merge the Pull Request in the data repo
if: ${{ github.event.pull_request.merged }}
run: |
gh pr merge "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--rebase \
--delete-branch
- name: Close the Pull Request in the data repo
if: ${{ !github.event.pull_request.merged }}
run: |
gh pr close "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--delete-branch
- name: Slack Notification
if: ${{ always() && job.status == 'failure' }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: https://github.com/${{ env.DATA_REPO }}/pull/${{ env.DATA_REPO_PR_NUMBER }}"

View File

@@ -137,7 +137,7 @@ jobs:
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci
BIG_TESTS_INSTANCE=TRUE NO_BIG_PARAMS_GPU=TRUE make test_integer_multi_bit_gpu_ci
- name: Run user docs tests
run: |

View File

@@ -25,9 +25,6 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
permissions:
contents: read

View File

@@ -25,9 +25,6 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
permissions:
contents: read

View File

@@ -13,7 +13,7 @@ env:
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: true

View File

@@ -21,7 +21,7 @@ jobs:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: git-sync
uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
uses: valtech-sd/git-sync@e734cfe9485a92e720eac5af8a4555dde5fecf88
with:
source_repo: "zama-ai/tfhe-rs"
source_branch: "main"

.github/workflows/unverified_prs.yml (new file)
View File

@@ -0,0 +1,26 @@
name: 'Close unverified PRs'
on:
schedule:
- cron: '30 1 * * *'
permissions: {}
jobs:
stale:
runs-on: ubuntu-latest
permissions:
issues: read
pull-requests: write
steps:
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
stale-pr-message: 'This PR is unverified and has been open for 2 days; it will now be closed. If you want to contribute, please sign the CLA as indicated by the bot.'
days-before-stale: 2
days-before-close: 0
# We are not interested in closing issues, so we use a currently non-existent label;
# if we ever allow issues to become stale/closable, this label will be the signal for that
only-issue-labels: can-be-auto-closed
# Only unverified PRs are an issue
exempt-pr-labels: cla-signed
# We don't want people commenting to keep an unverified PR open
ignore-updates: true

View File

@@ -10,6 +10,7 @@ ignore:
- keys
- coverage
- utils/tfhe-lints/ui/main.stderr
- utils/tfhe-backward-compat-data/**/*.ron # ron files are autogenerated
rules:
# checks if file ends in a newline character

View File

@@ -170,6 +170,8 @@ On the contrary, these changes are *not* data breaking:
* Renaming a type (unless it implements the `Named` trait).
* Adding a variant to the end of an enum.
Historical data from previous TFHE-rs versions is stored inside `utils/tfhe-backward-compat-data`. It is used to check, on every PR, that backward compatibility has been preserved.
## Example: adding a field
Suppose you want to add an i32 field to a type named `MyType`. The original type is defined as:

View File

@@ -18,7 +18,7 @@ members = [
]
exclude = [
"tests/backward_compatibility_tests",
"utils/tfhe-backward-compat-data",
"utils/tfhe-lints",
"apps/trivium",
]

View File

@@ -22,21 +22,7 @@ BENCH_TYPE?=latency
BENCH_PARAM_TYPE?=classical
BENCH_PARAMS_SET?=default
NODE_VERSION=22.6
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
BACKWARD_COMPAT_DATA_DEFAULT_BRANCH:=$(shell ./scripts/backward_compat_data_version.py)
BACKWARD_COMPAT_DATA_BRANCH?=$(BACKWARD_COMPAT_DATA_DEFAULT_BRANCH)
BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
ifeq ($(BACKWARD_COMPAT_DATA_DEFAULT_BRANCH), $(BACKWARD_COMPAT_DATA_BRANCH))
BACKWARD_COMPAT_CLIPPY_PATCH=
else
# We need to override the url for cargo patch accept it, see: https://github.com/rust-lang/cargo/issues/5478
BACKWARD_COMPAT_PATCHED_URL=https://www.github.com/zama-ai/tfhe-backward-compat-data.git
BACKWARD_COMPAT_CLIPPY_PATCH=\
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).branch=\"$(BACKWARD_COMPAT_DATA_BRANCH)\"" \
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).git=\"$(BACKWARD_COMPAT_PATCHED_URL)\""
endif
BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
TFHE_SPEC:=tfhe
WASM_PACK_VERSION="0.13.1"
# We are kind of hacking the cut here, the version cannot contain a quote '"'
@@ -170,23 +156,23 @@ install_tarpaulin: install_rs_build_toolchain
.PHONY: install_cargo_dylint # Install custom tfhe-rs lints
install_cargo_dylint:
cargo install cargo-dylint dylint-link
cargo install --locked cargo-dylint dylint-link
.PHONY: install_typos_checker # Install typos checker
install_typos_checker: install_rs_build_toolchain
@typos --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked typos-cli || \
( echo "Unable to install typos-cli, unknown error." && exit 1 )
.PHONY: install_zizmor # Install zizmor workflow security checker
install_zizmor: install_rs_build_toolchain
@zizmor --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked zizmor --version ~1.9 || \
( echo "Unable to install zizmor, unknown error." && exit 1 )
.PHONY: install_cargo_cross # Install custom tfhe-rs lints
.PHONY: install_cargo_cross # Install cross for big endian tests
install_cargo_cross: install_rs_build_toolchain
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked cross
.PHONY: setup_venv # Setup Python virtualenv for wasm tests
setup_venv:
@@ -263,6 +249,9 @@ install_mlc: install_rs_build_toolchain
.PHONY: fmt # Format rust code
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt
.PHONY: fmt_js # Format javascript code
fmt_js: check_nvm_installed
@@ -284,6 +273,9 @@ fmt_c_tests:
.PHONY: check_fmt # Check rust code format
check_fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check
.PHONY: check_fmt_c_tests # Check C tests format
check_fmt_c_tests:
@@ -453,7 +445,6 @@ clippy_trivium: install_rs_check_toolchain
.PHONY: clippy_ws_tests # Run clippy on the workspace level tests
clippy_ws_tests: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --tests \
$(BACKWARD_COMPAT_CLIPPY_PATCH) \
-p tests --features=shortint,integer,zk-pok -- --no-deps -D warnings
.PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
@@ -495,10 +486,22 @@ clippy_param_dedup: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p param_dedup -- --no-deps -D warnings
.PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
@# Some old crates are x86 specific, only run in that case
@if uname -a | grep -q x86; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all-targets \
-- --no-deps -D warnings; \
else \
echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
fi
.PHONY: clippy_all # Run all clippy targets
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup
clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
clippy_backward_compat_data
.PHONY: clippy_fast # Run main clippy targets
clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -515,10 +518,10 @@ clippy_hpu_backend: install_rs_check_toolchain
-p tfhe-hpu-backend -- --no-deps -D warnings
.PHONY: clippy_hpu_mockup # Run clippy lints on tfhe-hpu-mockup
clippy_hpu: install_rs_check_toolchain
clippy_hpu_mockup: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--all-targets \
-p tfhe-hpu-backend -- --no-deps -D warnings
-p tfhe-hpu-mockup -- --no-deps -D warnings
.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
check_rust_bindings_did_not_change:
@@ -1054,16 +1057,11 @@ test_tfhe_lints: install_cargo_dylint
# Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
.PHONY: test_backward_compatibility_ci
test_backward_compatibility_ci: install_rs_build_toolchain
TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tests/$(BACKWARD_COMPAT_DATA_DIR)\"" \
TFHE_BACKWARD_COMPAT_DATA_DIR="../$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=shortint,integer,zk-pok -p tests test_backward_compatibility -- --nocapture
.PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
test_backward_compatibility: tests/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
.PHONY: backward_compat_branch # Prints the required backward compatibility branch
backward_compat_branch:
@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"
test_backward_compatibility: pull_backward_compat_data test_backward_compatibility_ci
.PHONY: doc # Build rust doc
doc: install_rs_check_toolchain
@@ -1108,6 +1106,10 @@ check_intra_md_links: install_mlc
check_md_links: install_mlc
mlc --match-file-extension tfhe/docs
.PHONY: check_doc_paths_use_dash # Check paths use "-" instead of "_" in docs for gitbook compatibility
check_doc_paths_use_dash:
python3 ./scripts/check_doc_paths_use_dash.py
.PHONY: check_parameter_export_ok # Checks exported "current" shortint parameter module is correct
check_parameter_export_ok:
python3 ./scripts/check_current_param_export.py
@@ -1465,6 +1467,20 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --
.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
bench_hlapi_noise_squash: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_hlapi_noise_squash_gpu # Run benchmarks for noise squash operation on GPU
bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
#
# Utility tools
#
@@ -1522,11 +1538,13 @@ write_params_to_file: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
--example write_params_to_file --features=boolean,shortint,hpu,internal-keycache
.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
clone_backward_compat_data:
./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tests/$(BACKWARD_COMPAT_DATA_DIR)
.PHONY: pull_backward_compat_data # Pull the data files needed for backward compatibility tests
pull_backward_compat_data:
./scripts/pull_lfs_data.sh $(BACKWARD_COMPAT_DATA_DIR)
tests/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
.PHONY: pull_hpu_files # Pull the hpu files
pull_hpu_files:
./scripts/pull_lfs_data.sh backends/tfhe-hpu-backend/
#
# Real use case examples
@@ -1552,7 +1570,8 @@ sha256_bool: install_rs_check_toolchain
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
check_md_docs_are_tested check_intra_md_links clippy_all check_compile_tests test_tfhe_lints \
check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash \
clippy_all check_compile_tests test_tfhe_lints \
tfhe_lints
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
@@ -1564,7 +1583,7 @@ pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
check_md_docs_are_tested clippy_fast check_compile_tests
check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash clippy_fast check_compile_tests
.PHONY: conformance # Automatically fix problems that can be fixed
conformance: fix_newline fmt fmt_js

View File

@@ -149,7 +149,7 @@ To run this code, use the following command:
> Note that when running code that uses `TFHE-rs`, it is highly recommended
to run in release mode with cargo's `--release` flag to have the best performances possible.
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*
<p align="right">
<a href="#about" > ↑ Back to top </a>

View File

@@ -49,7 +49,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index,
uint64_t *size_tracker,
uint64_t &size_tracker,
bool allocate_gpu_memory);
void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
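The signature change above (and the matching ones in the buffer structs later in this diff) replaces the nullable `uint64_t *size_tracker` out-parameter with a `uint64_t &` reference, per commit 48dfeb21dc ("refactor size tracker to avoid future bugs"). The same idea expressed as a Rust sketch; `scratch_old`/`scratch_new` are hypothetical helpers, with `Option<&mut u64>` approximating the nullable pointer:

```rust
// Old shape: an optional out-parameter. Every call site must handle the
// absent case and remember to accumulate through the indirection.
fn scratch_old(size_tracker: Option<&mut u64>) -> u64 {
    let size = 128;
    if let Some(tracker) = size_tracker {
        *tracker += size;
    }
    size
}

// New shape: the tracker cannot be absent, so accumulation is one line
// and forgetting it is much harder; no null check can be missed.
fn scratch_new(size_tracker: &mut u64) -> u64 {
    let size = 128;
    *size_tracker += size;
    size
}

fn main() {
    let mut tracked = 0u64;
    scratch_old(Some(&mut tracked));
    scratch_new(&mut tracked);
    println!("tracked bytes: {tracked}");
}
```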

View File

@@ -20,7 +20,7 @@ template <typename Torus> struct int_compression {
uint32_t gpu_count, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
@@ -38,7 +38,7 @@ template <typename Torus> struct int_compression {
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0], size_tracker, allocate_gpu_memory);
*size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
streams[0], gpu_indexes[0], &fp_ks_buffer,
compression_params.small_lwe_dimension,
compression_params.glwe_dimension, compression_params.polynomial_size,
@@ -76,7 +76,7 @@ template <typename Torus> struct int_decompression {
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->encryption_params = encryption_params;
this->compression_params = compression_params;

View File

@@ -638,5 +638,95 @@ void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_scalar_bits, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool l_exceed_threshold, bool is_power_of_two, bool multiplier_is_small,
bool allocate_ms_array);
void cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *ksks,
void *const *bsks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool l_exceed_threshold, bool is_power_of_two, bool multiplier_is_small,
uint32_t l, uint32_t shift_post, bool is_rhs_power_of_two, bool is_rhs_zero,
bool is_rhs_one, uint32_t rhs_shift, uint32_t numerator_bits,
uint32_t num_scalars, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set);
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool is_divisor_power_of_two,
bool log2_divisor_exceeds_threshold, bool multiplier_exceeds_threshold,
uint32_t num_scalar_bits_for_div, uint32_t num_scalar_bits_for_mul,
uint32_t ilog2_divisor, uint64_t divisor, bool allocate_ms_array);
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *ksks, void *const *bsks,
uint64_t const *decomposed_scalar_for_div,
uint64_t const *decomposed_scalar_for_mul,
uint64_t const *has_at_least_one_set_for_div,
uint64_t const *has_at_least_one_set_for_mul,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
uint32_t num_scalars_for_div, uint32_t num_scalars_for_mul,
bool multiplier_exceeds_threshold, bool is_divisor_power_of_two,
bool log2_divisor_exceeds_threshold, uint32_t ilog2_divisor,
uint64_t divisor, uint64_t shift_pre, uint32_t shift_post, uint64_t rhs,
void const *clear_blocks, void const *h_clear_blocks,
uint32_t num_clear_blocks);
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
uint32_t num_scalar_bits_for_div, uint32_t num_scalar_bits_for_mul,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool l_exceed_threshold, bool is_absolute_divisor_power_of_two,
bool is_divisor_zero, bool multiplier_is_small, bool allocate_ms_array);
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *ksks, void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool is_divisor_zero, bool l_exceed_threshold,
bool is_absolute_divisor_power_of_two, bool multiplier_is_small, uint32_t l,
uint32_t shift_post, bool is_rhs_power_of_two, bool is_rhs_zero,
bool is_rhs_one, uint32_t rhs_shift, uint32_t divisor_shift,
uint32_t numerator_bits, uint32_t num_scalars_for_div,
uint32_t num_scalars_for_mul, uint64_t const *decomposed_scalar_for_div,
uint64_t const *decomposed_scalar_for_mul,
uint64_t const *has_at_least_one_set_for_div,
uint64_t const *has_at_least_one_set_for_mul);
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H

View File

@@ -289,7 +289,7 @@ template <typename Torus> struct int_radix_lut {
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
this->params = params;
this->num_blocks = num_radix_blocks;
@@ -305,6 +305,7 @@ template <typename Torus> struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
printf("Active GPUs in int_radix_lut: %d, gpu count: %d\n", active_gpu_count, gpu_count);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(i);
int8_t *gpu_pbs_buffer;
@@ -316,9 +317,9 @@ template <typename Torus> struct int_radix_lut {
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
allocate_gpu_memory, params.allocate_ms_array, &size);
if (i == 0 && size_tracker != nullptr) {
*size_tracker += size;
allocate_gpu_memory, params.allocate_ms_array, size);
if (i == 0) {
size_tracker += size;
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
@@ -412,7 +413,7 @@ template <typename Torus> struct int_radix_lut {
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, int_radix_lut *base_lut_object,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->num_blocks = num_radix_blocks;
@@ -447,6 +448,7 @@ template <typename Torus> struct int_radix_lut {
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_radix_lut 1: %d, gpu count: %d\n", active_gpu_count, gpu_count);
for (uint i = 0; i < active_gpu_count; i++) {
auto lut = (Torus *)cuda_malloc_with_size_tracking_async(
num_luts * lut_buffer_size, streams[i], gpu_indexes[i], size_tracker,
@@ -504,7 +506,7 @@ template <typename Torus> struct int_radix_lut {
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, uint32_t num_many_lut,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->num_many_lut = num_many_lut;
this->params = params;
@@ -520,6 +522,7 @@ template <typename Torus> struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_radix_lut 2: %d, gpu count: %d\n", active_gpu_count, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(i);
@@ -532,9 +535,9 @@ template <typename Torus> struct int_radix_lut {
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
allocate_gpu_memory, params.allocate_ms_array, &size);
allocate_gpu_memory, params.allocate_ms_array, size);
if (i == 0) {
*size_tracker += size;
size_tracker += size;
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
@@ -808,7 +811,7 @@ template <typename InputTorus> struct int_noise_squashing_lut {
uint32_t input_polynomial_size,
uint32_t num_radix_blocks,
uint32_t original_num_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->num_blocks = num_radix_blocks;
gpu_memory_allocated = allocate_gpu_memory;
@@ -827,6 +830,7 @@ template <typename InputTorus> struct int_noise_squashing_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_noise_squash_lut 1: %d, gpu count: %d\n", active_gpu_count, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(i);
@@ -838,10 +842,10 @@ template <typename InputTorus> struct int_noise_squashing_lut {
params.small_lwe_dimension, params.glwe_dimension,
params.polynomial_size, params.pbs_level,
num_radix_blocks_on_gpu, allocate_gpu_memory,
params.allocate_ms_array, &size);
params.allocate_ms_array, size);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
if (i == 0 && size_tracker != nullptr) {
*size_tracker += size;
if (i == 0) {
size_tracker += size;
}
pbs_buffer.push_back(gpu_pbs_buffer);
}
@@ -968,7 +972,7 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params, uint32_t bits_per_block,
uint32_t final_offset, uint32_t num_radix_blocks,
bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
@@ -1040,7 +1044,7 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t bits_per_block,
uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker)
bool allocate_gpu_memory, uint64_t &size_tracker)
: int_bit_extract_luts_buffer(streams, gpu_indexes, gpu_count, params,
bits_per_block, 0, num_radix_blocks,
allocate_gpu_memory, size_tracker) {}
@@ -1078,7 +1082,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
this->shift_type = shift_type;
this->is_signed = is_signed;
this->params = params;
@@ -1227,7 +1231,7 @@ template <typename Torus> struct int_fullprop_buffer {
int_fullprop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2, 2,
@@ -1312,7 +1316,7 @@ template <typename Torus> struct int_overflowing_sub_memory {
int_overflowing_sub_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
@@ -1459,7 +1463,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
void setup_index_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
@@ -1470,7 +1474,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
auto setup_columns =
[num_blocks_in_radix, max_num_radix_in_vec, streams,
gpu_indexes](uint32_t **&columns, uint32_t *&columns_data,
uint32_t *&columns_counter, uint64_t *size_tracker,
uint32_t *&columns_counter, uint64_t &size_tracker,
bool gpu_memory_allocated) {
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
@@ -1524,7 +1528,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint64_t size_tracker = 0;
luts_message_carry =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
pbs_count, true, &size_tracker);
pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
}
}
@@ -1561,7 +1565,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec,
bool reduce_degrees_for_single_carry_propagation,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->mem_reuse = false;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
@@ -1604,7 +1608,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut,
bool reduce_degrees_for_single_carry_propagation,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->mem_reuse = true;
this->params = params;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
@@ -1668,7 +1672,7 @@ template <typename Torus> struct int_seq_group_prop_memory {
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t group_size,
uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -1728,7 +1732,7 @@ template <typename Torus> struct int_hs_group_prop_memory {
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_groups,
uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -1783,7 +1787,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
@@ -1982,7 +1986,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks,
uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
@@ -2288,7 +2292,7 @@ template <typename Torus> struct int_sc_prop_memory {
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
uint32_t uses_carry, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
@@ -2472,6 +2476,8 @@ template <typename Torus> struct int_sc_prop_memory {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], output_flag,
gpu_memory_allocated);
lut_message_extract->release(streams, gpu_indexes, gpu_count);
delete shifted_blocks_state_mem;
delete prop_simu_group_carries_mem;
delete output_flag;
delete lut_message_extract;
@@ -2500,7 +2506,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
@@ -2718,7 +2724,7 @@ template <typename Torus> struct int_borrow_prop_memory {
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
uint32_t compute_overflow_in, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
@@ -2787,6 +2793,7 @@ template <typename Torus> struct int_borrow_prop_memory {
}
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_borrow_prop: %d, gpu count: %d\n", active_gpu_count, gpu_count);
sub_streams_1 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_2 =
@@ -2874,10 +2881,11 @@ template <typename Torus> struct int_zero_out_if_buffer {
int_zero_out_if_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_zero_out_if: %d, gpu count: %d\n", active_gpu_count, gpu_count);
tmp = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -2928,7 +2936,7 @@ template <typename Torus> struct int_mul_memory {
uint32_t gpu_count, int_radix_params params,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->boolean_mul = is_boolean_left || is_boolean_right;
this->params = params;
@@ -3083,7 +3091,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->shift_type = shift_type;
this->params = params;
@@ -3167,7 +3175,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, CudaRadixCiphertextFFI *pre_allocated_buffer,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->shift_type = shift_type;
this->params = params;
@@ -3280,9 +3288,10 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, SHIFT_OR_ROTATE_TYPE shift_type,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
active_gpu_count = get_active_gpu_count(1, gpu_count);
printf("Active GPUs in int_arithmetic_scalar_shift: %d, gpu count: %d\n", active_gpu_count, gpu_count);
// In the arithmetic shift, a PBS has to be applied to the last rotated
// block twice: once to shift it, once to compute the padding block to be
// copied onto all blocks to the left of the last rotated block
@@ -3454,7 +3463,7 @@ template <typename Torus> struct int_cmux_buffer {
uint32_t gpu_count,
std::function<Torus(Torus)> predicate_lut_f,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
@@ -3565,7 +3574,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
COMPARISON_TYPE op, int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->op = op;
@@ -3627,7 +3636,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
uint32_t const *gpu_indexes, uint32_t gpu_count,
COMPARISON_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->op = op;
@@ -3728,7 +3737,7 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::function<Torus(Torus)> operator_f,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
@@ -3808,7 +3817,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
uint32_t const *gpu_indexes, uint32_t gpu_count,
COMPARISON_TYPE op, int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->op = op;
@@ -3911,13 +3920,14 @@ template <typename Torus> struct int_comparison_buffer {
uint32_t const *gpu_indexes, uint32_t gpu_count,
COMPARISON_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool is_signed,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->op = op;
this->is_signed = is_signed;
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in int_compar: %d, gpu count: %d\n", active_gpu_count, gpu_count);
identity_lut_f = [](Torus x) -> Torus { return x; };
@@ -4171,7 +4181,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
void init_temporary_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
// non boolean temporary arrays, with `num_blocks` blocks
remainder1 = new CudaRadixCiphertextFFI;
@@ -4253,7 +4263,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
void init_lookup_tables(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
// create and generate masking_luts_1[] and masking_lut_2[]
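For a power-of-two message modulus the expression above is exactly log2; a small host-side illustration:

#include <cstdint>

// 31 - clz(m) == floor(log2(m)) for m >= 1; exact log2 when m is a power of two
// m = 4  -> __builtin_clz(4)  = 29 -> 2 bits
// m = 16 -> __builtin_clz(16) = 27 -> 4 bits
static uint32_t bits_in_message(uint32_t message_modulus) {
  return 31 - __builtin_clz(message_modulus);
}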
@@ -4420,9 +4430,10 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_blocks,
bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
printf("Active GPUs in int_div_rem: %d, gpu count: %d\n", active_gpu_count, gpu_count);
this->params = params;
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
@@ -4473,7 +4484,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint32_t const *gpu_indexes,
uint32_t num_blocks, uint32_t group_size,
bool use_seq, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
max_indexes_to_erase = num_blocks;
first_indexes_for_overflow_sub =
@@ -4725,7 +4736,7 @@ template <typename Torus> struct int_bitop_buffer {
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
@@ -4813,7 +4824,7 @@ template <typename Torus> struct int_scalar_mul_buffer {
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool anticipated_buffer_drop, uint64_t *size_tracker) {
bool anticipated_buffer_drop, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->anticipated_buffers_drop = anticipated_buffer_drop;
@@ -4829,7 +4840,7 @@ template <typename Torus> struct int_scalar_mul_buffer {
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], preshifted_buffer,
msg_bits * num_radix_blocks, params.big_lwe_dimension,
&anticipated_drop_mem, allocate_gpu_memory);
anticipated_drop_mem, allocate_gpu_memory);
all_shifted_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -4840,27 +4851,27 @@ template <typename Torus> struct int_scalar_mul_buffer {
if (num_ciphertext_bits * num_radix_blocks >= num_radix_blocks + 2)
logical_scalar_shift_buffer = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, LEFT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, all_shifted_buffer, &anticipated_drop_mem);
allocate_gpu_memory, all_shifted_buffer, anticipated_drop_mem);
else
logical_scalar_shift_buffer = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, LEFT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, &anticipated_drop_mem);
allocate_gpu_memory, anticipated_drop_mem);
uint64_t last_step_mem = 0;
if (num_ciphertext_bits > 0) {
sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_ciphertext_bits, true, allocate_gpu_memory, &last_step_mem);
num_ciphertext_bits, true, allocate_gpu_memory, last_step_mem);
}
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
requested_flag, uses_carry, allocate_gpu_memory, &last_step_mem);
requested_flag, uses_carry, allocate_gpu_memory, last_step_mem);
if (anticipated_buffer_drop) {
*size_tracker += std::max(anticipated_drop_mem, last_step_mem);
size_tracker += std::max(anticipated_drop_mem, last_step_mem);
} else {
*size_tracker += anticipated_drop_mem + last_step_mem;
size_tracker += anticipated_drop_mem + last_step_mem;
}
}
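The max-versus-sum branch above is the whole point of `anticipated_buffer_drop`; a minimal sketch of the accounting, assuming the two allocation phases never coexist when the drop is anticipated:

#include <algorithm>
#include <cstdint>

uint64_t tracked_bytes(uint64_t anticipated_drop_mem, uint64_t last_step_mem,
                       bool anticipated_buffer_drop) {
  return anticipated_buffer_drop
             ? std::max(anticipated_drop_mem, last_step_mem) // phases don't overlap
             : anticipated_drop_mem + last_step_mem;         // both live at once
}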
@@ -4898,7 +4909,7 @@ template <typename Torus> struct int_abs_buffer {
int_abs_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
arithmetic_scalar_shift_mem = new int_arithmetic_scalar_shift_buffer<Torus>(
@@ -4968,7 +4979,7 @@ template <typename Torus> struct int_div_rem_memory {
int_div_rem_memory(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
bool is_signed, uint32_t num_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
@@ -5128,7 +5139,7 @@ template <typename Torus> struct int_div_rem_memory {
}
};
template <typename Torus> struct int_scalar_mul_high {
template <typename Torus> struct int_scalar_mul_high_buffer {
int_radix_params params;
bool allocate_gpu_memory;
@@ -5137,17 +5148,20 @@ template <typename Torus> struct int_scalar_mul_high {
CudaRadixCiphertextFFI *tmp;
int_scalar_mul_high(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
SHIFT_OR_ROTATE_TYPE shift_type, uint32_t num_scalar_bits,
bool anticipated_buffer_drop, uint64_t *size_tracker) {
int_scalar_mul_high_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks,
const bool allocate_gpu_memory,
uint32_t num_scalar_bits,
bool anticipated_buffer_drop,
uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params,
streams, gpu_indexes, gpu_count, RIGHT_SHIFT, params,
2 * num_radix_blocks, allocate_gpu_memory, size_tracker);
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
@@ -5188,7 +5202,7 @@ template <typename Torus> struct int_sub_and_propagate {
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t *size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
@@ -5229,7 +5243,7 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
uint32_t num_radix_blocks, uint32_t num_additional_blocks,
const bool allocate_gpu_memory, uint64_t *size_tracker) {
const bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
@@ -5298,7 +5312,7 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
CudaRadixCiphertextFFI *tmp_ffi;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem;
int_scalar_mul_high<Torus> *scalar_mul_high_mem;
int_scalar_mul_high_buffer<Torus> *scalar_mul_high_mem;
int_sc_prop_memory<Torus> *scp_mem;
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
@@ -5308,7 +5322,7 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
bool is_divisor_power_of_two, bool log2_divisor_exceeds_threshold,
bool multiplier_exceeds_threshold, uint32_t ilog2_divisor,
uint32_t num_scalar_bits, uint64_t *size_tracker) {
uint32_t num_scalar_bits, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
@@ -5339,10 +5353,9 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, RIGHT_SHIFT, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
scalar_mul_high_mem = new int_scalar_mul_high<Torus>(
scalar_mul_high_mem = new int_scalar_mul_high_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, RIGHT_SHIFT, num_scalar_bits, true,
size_tracker);
allocate_gpu_memory, num_scalar_bits, true, size_tracker);
scp_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
FLAG_NONE, (uint32_t)0, allocate_gpu_memory, size_tracker);
@@ -5359,10 +5372,9 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, RIGHT_SHIFT, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
scalar_mul_high_mem = new int_scalar_mul_high<Torus>(
scalar_mul_high_mem = new int_scalar_mul_high_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, RIGHT_SHIFT, num_scalar_bits, true,
size_tracker);
allocate_gpu_memory, num_scalar_bits, true, size_tracker);
}
}
}
@@ -5394,6 +5406,360 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
}
};
template <typename Torus> struct int_signed_scalar_mul_high_buffer {
int_radix_params params;
bool allocate_gpu_memory;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem;
int_scalar_mul_buffer<Torus> *scalar_mul_mem;
int_extend_radix_with_sign_msb_buffer<Torus> *extend_radix_mem;
CudaRadixCiphertextFFI *tmp;
int_signed_scalar_mul_high_buffer(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
uint32_t num_scalar_bits, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, RIGHT_SHIFT, params,
2 * num_radix_blocks, allocate_gpu_memory, size_tracker);
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, 2 * num_radix_blocks,
num_scalar_bits, allocate_gpu_memory, true, size_tracker);
this->tmp = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp, 2 * num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->extend_radix_mem = new int_extend_radix_with_sign_msb_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_radix_blocks, allocate_gpu_memory, size_tracker);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
logical_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
delete logical_scalar_shift_mem;
scalar_mul_mem->release(streams, gpu_indexes, gpu_count);
delete scalar_mul_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp,
allocate_gpu_memory);
delete tmp;
extend_radix_mem->release(streams, gpu_indexes, gpu_count);
delete extend_radix_mem;
}
};
template <typename Torus> struct int_signed_scalar_div_mem {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *tmp_ffi;
CudaRadixCiphertextFFI *xsign_ffi;
int_arithmetic_scalar_shift_buffer<Torus> *arithmetic_scalar_shift_mem;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem;
int_signed_scalar_mul_high_buffer<Torus> *scalar_mul_high_mem;
int_sc_prop_memory<Torus> *scp_mem;
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
int_signed_scalar_div_mem(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t num_scalar_bits,
const bool allocate_gpu_memory,
bool is_absolute_divisor_one,
bool is_divisor_negative, bool l_exceed_threshold,
bool is_power_of_two, bool multiplier_is_small,
uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->tmp_ffi = nullptr;
this->xsign_ffi = nullptr;
this->arithmetic_scalar_shift_mem = nullptr;
this->logical_scalar_shift_mem = nullptr;
this->scalar_mul_high_mem = nullptr;
this->scp_mem = nullptr;
this->sub_and_propagate_mem = nullptr;
if (!l_exceed_threshold) {
if (is_absolute_divisor_one && is_divisor_negative) {
tmp_ffi = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp_ffi, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
} else if (!is_absolute_divisor_one) {
tmp_ffi = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp_ffi, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
arithmetic_scalar_shift_mem =
new int_arithmetic_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, RIGHT_SHIFT, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
if (is_power_of_two) {
logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, RIGHT_SHIFT, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
scp_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
FLAG_NONE, (uint32_t)0, allocate_gpu_memory, size_tracker);
} else {
xsign_ffi = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], xsign_ffi, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
scalar_mul_high_mem = new int_signed_scalar_mul_high_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, num_scalar_bits, size_tracker);
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
FLAG_NONE, allocate_gpu_memory, size_tracker);
if (!multiplier_is_small) {
scp_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
FLAG_NONE, (uint32_t)0, allocate_gpu_memory, size_tracker);
}
}
}
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
if (arithmetic_scalar_shift_mem != nullptr) {
arithmetic_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
delete arithmetic_scalar_shift_mem;
}
if (logical_scalar_shift_mem != nullptr) {
logical_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
delete logical_scalar_shift_mem;
}
if (scalar_mul_high_mem != nullptr) {
scalar_mul_high_mem->release(streams, gpu_indexes, gpu_count);
delete scalar_mul_high_mem;
}
if (scp_mem != nullptr) {
scp_mem->release(streams, gpu_indexes, gpu_count);
delete scp_mem;
}
if (sub_and_propagate_mem != nullptr) {
sub_and_propagate_mem->release(streams, gpu_indexes, gpu_count);
delete sub_and_propagate_mem;
}
if (tmp_ffi != nullptr) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp_ffi,
allocate_gpu_memory);
delete tmp_ffi;
}
if (xsign_ffi != nullptr) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], xsign_ffi,
allocate_gpu_memory);
delete xsign_ffi;
}
}
};
template <typename Torus> struct int_unsigned_scalar_div_rem_buffer {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *numerator_ct;
int_unsigned_scalar_div_mem<Torus> *unsigned_div_mem;
int_bitop_buffer<Torus> *bitop_mem;
int_scalar_mul_buffer<Torus> *scalar_mul_mem;
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
int_unsigned_scalar_div_rem_buffer(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
const bool anticipated_buffer_drop, uint32_t num_scalar_bits_for_div,
uint32_t num_scalar_bits_for_mul, bool is_divisor_power_of_two,
bool log2_divisor_exceeds_threshold, bool multiplier_exceeds_threshold,
uint32_t ilog2_divisor, uint64_t divisor, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->bitop_mem = nullptr;
this->scalar_mul_mem = nullptr;
this->sub_and_propagate_mem = nullptr;
this->numerator_ct = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], numerator_ct, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->unsigned_div_mem = new int_unsigned_scalar_div_mem<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, is_divisor_power_of_two,
log2_divisor_exceeds_threshold, multiplier_exceeds_threshold,
ilog2_divisor, num_scalar_bits_for_div, size_tracker);
if (is_divisor_power_of_two) {
this->bitop_mem = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, SCALAR_BITAND, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
} else {
if (divisor != (uint64_t)0 && divisor != (uint64_t)1 &&
num_radix_blocks != 0) {
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_scalar_bits_for_mul, allocate_gpu_memory,
anticipated_buffer_drop, size_tracker);
}
this->sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, FLAG_NONE,
allocate_gpu_memory, size_tracker);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], numerator_ct,
allocate_gpu_memory);
delete numerator_ct;
unsigned_div_mem->release(streams, gpu_indexes, gpu_count);
delete unsigned_div_mem;
if (bitop_mem != nullptr) {
bitop_mem->release(streams, gpu_indexes, gpu_count);
delete bitop_mem;
}
if (scalar_mul_mem != nullptr) {
scalar_mul_mem->release(streams, gpu_indexes, gpu_count);
delete scalar_mul_mem;
}
if (sub_and_propagate_mem != nullptr) {
sub_and_propagate_mem->release(streams, gpu_indexes, gpu_count);
delete sub_and_propagate_mem;
}
}
};
template <typename Torus> struct int_signed_scalar_div_rem_buffer {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *numerator_ct;
int_signed_scalar_div_mem<Torus> *signed_div_mem;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem;
int_scalar_mul_buffer<Torus> *scalar_mul_mem;
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
int_sc_prop_memory<Torus> *scp_mem;
int_signed_scalar_div_rem_buffer(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
const bool anticipated_buffer_drop, uint32_t num_scalar_bits_for_div,
uint32_t num_scalar_bits_for_mul, bool is_absolute_divisor_one,
bool is_divisor_negative, bool l_exceed_threshold,
bool is_absolute_divisor_power_of_two, bool is_divisor_zero,
bool multiplier_is_small, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->scalar_mul_mem = nullptr;
this->logical_scalar_shift_mem = nullptr;
this->numerator_ct = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], numerator_ct, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->signed_div_mem = new int_signed_scalar_div_mem<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_scalar_bits_for_div, allocate_gpu_memory, is_absolute_divisor_one,
is_divisor_negative, l_exceed_threshold,
is_absolute_divisor_power_of_two, multiplier_is_small, size_tracker);
this->scp_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, FLAG_NONE,
(uint32_t)0, allocate_gpu_memory, size_tracker);
bool is_divisor_one = is_absolute_divisor_one && !is_divisor_negative;
if (!is_divisor_negative && is_absolute_divisor_power_of_two) {
this->logical_scalar_shift_mem =
new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, LEFT_SHIFT, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
} else if (!is_divisor_zero && !is_divisor_one && num_radix_blocks != 0) {
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_scalar_bits_for_mul, allocate_gpu_memory, anticipated_buffer_drop,
size_tracker);
}
this->sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, FLAG_NONE,
allocate_gpu_memory, size_tracker);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], numerator_ct,
allocate_gpu_memory);
delete numerator_ct;
signed_div_mem->release(streams, gpu_indexes, gpu_count);
delete signed_div_mem;
scp_mem->release(streams, gpu_indexes, gpu_count);
delete scp_mem;
if (logical_scalar_shift_mem != nullptr) {
logical_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
delete logical_scalar_shift_mem;
}
if (scalar_mul_mem != nullptr) {
scalar_mul_mem->release(streams, gpu_indexes, gpu_count);
delete scalar_mul_mem;
}
sub_and_propagate_mem->release(streams, gpu_indexes, gpu_count);
delete sub_and_propagate_mem;
}
};
void update_degrees_after_bitand(uint64_t *output_degrees,
uint64_t *lwe_array_1_degrees,
uint64_t *lwe_array_2_degrees,

View File

@@ -66,6 +66,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t num_many_lut, uint32_t lut_stride);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
uint32_t polynomial_size);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
uint32_t polynomial_size);
template <typename Torus>
@@ -95,8 +98,12 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size);
uint32_t polynomial_size,
uint64_t full_sm_keybundle);
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle);
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
@@ -115,7 +122,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
@@ -281,4 +288,146 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
}
};
template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_keybundle = NULL;
int8_t *d_mem_acc_step_one = NULL;
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
uint32_t lwe_chunk_size;
double *keybundle_fft;
__uint128_t *global_accumulator;
double *global_join_buffer;
PBS_VARIANT pbs_variant;
bool gpu_memory_allocated;
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
this->lwe_chunk_size = lwe_chunk_size;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
// default
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
__uint128_t>(polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<
__uint128_t>(polynomial_size);
uint64_t partial_sm_accumulate_step_one =
get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
__uint128_t>(polynomial_size);
// cg
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>(
polynomial_size);
uint64_t partial_sm_cg_accumulate =
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
__uint128_t>(polynomial_size);
auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
(glwe_dimension + 1) * (glwe_dimension + 1) *
level_count;
auto num_blocks_acc_step_one =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
auto num_blocks_acc_step_two =
input_lwe_ciphertext_count * (glwe_dimension + 1);
auto num_blocks_acc_cg =
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
// Keybundle
if (max_shared_memory < full_sm_keybundle)
d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
size_tracker, allocate_gpu_memory);
switch (pbs_variant) {
case PBS_VARIANT::CG:
// Accumulator CG
if (max_shared_memory < partial_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
size_tracker, allocate_gpu_memory);
else if (max_shared_memory < full_sm_cg_accumulate)
d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
size_tracker, allocate_gpu_memory);
break;
case PBS_VARIANT::DEFAULT:
// Accumulator step one
if (max_shared_memory < partial_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
gpu_index, size_tracker, allocate_gpu_memory);
else if (max_shared_memory < full_sm_accumulate_step_one)
d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
gpu_index, size_tracker, allocate_gpu_memory);
// Accumulator step two
if (max_shared_memory < full_sm_accumulate_step_two)
d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
gpu_index, size_tracker, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
stream, gpu_index, size_tracker, allocate_gpu_memory);
global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
sizeof(__uint128_t),
stream, gpu_index, size_tracker, allocate_gpu_memory);
global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
(polynomial_size / 2) * 4 * sizeof(double),
stream, gpu_index, size_tracker, allocate_gpu_memory);
}
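The shared-memory checks in the constructor above follow a three-way fallback; a hedged sketch of the ladder used for the accumulator buffers:

#include <cstdint>

enum class SmMode { FullSm, PartialSm, NoSm };

SmMode pick_sm_mode(uint64_t max_shared, uint64_t partial_sm, uint64_t full_sm) {
  if (max_shared < partial_sm)
    return SmMode::NoSm;      // full-size buffer per block in global memory
  if (max_shared < full_sm)
    return SmMode::PartialSm; // smaller partial-size buffer in global memory
  return SmMode::FullSm;      // kernel runs entirely from shared memory
}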
void release(cudaStream_t stream, uint32_t gpu_index) {
if (d_mem_keybundle)
cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
gpu_memory_allocated);
switch (pbs_variant) {
case DEFAULT:
if (d_mem_acc_step_one)
cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
gpu_index, gpu_memory_allocated);
if (d_mem_acc_step_two)
cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
gpu_index, gpu_memory_allocated);
break;
case CG:
if (d_mem_acc_cg)
cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
gpu_memory_allocated);
break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
gpu_memory_allocated);
}
};
#endif // CUDA_MULTI_BIT_UTILITIES_H

View File

@@ -90,7 +90,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
bool allocate_ms_array, uint64_t *size_tracker) {
bool allocate_ms_array, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
this->uses_noise_reduction = allocate_ms_array;
@@ -240,7 +240,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;
template <typename InputTorus>
struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
int8_t *d_mem;
__uint128_t *global_accumulator;
@@ -257,7 +260,7 @@ template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
bool allocate_gpu_memory, bool allocate_ms_array,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;

View File

@@ -15,6 +15,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -33,6 +38,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,
int8_t **buffer);
}
#endif // CUDA_MULTI_BIT_H

View File

@@ -27,7 +27,7 @@ template <typename Torus> struct zk_expand_mem {
int_radix_params casting_params, KS_TYPE casting_key_type,
const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
bool allocate_gpu_memory, uint64_t *size_tracker)
bool allocate_gpu_memory, uint64_t &size_tracker)
: computing_params(computing_params), casting_params(casting_params),
num_compact_lists(num_compact_lists),
casting_key_type(casting_key_type) {

View File

@@ -1,5 +1,6 @@
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
pbs/programmable_bootstrap_multibit_128.cu)
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

View File

@@ -189,7 +189,7 @@ __host__ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe(
uint64_t size_tracker = 0;
uint64_t buffer_size = 2 * num_lwes * memory_unit * sizeof(Torus);
*fp_ks_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
return size_tracker;
}
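The scratch function above returns the tracked byte count; a self-contained sketch of the measure-then-allocate pattern this enables (all names hypothetical, not this codebase's API):

#include <cstdint>
#include <cstdlib>

static uint64_t scratch_example(int8_t **buffer, uint64_t bytes,
                                uint64_t &size_tracker, bool allocate) {
  size_tracker += bytes; // counted in both modes, mirroring the code above
  if (allocate)
    *buffer = static_cast<int8_t *>(std::malloc(bytes)); // stand-in for cudaMalloc
  return size_tracker;
}

int main() {
  int8_t *buf = nullptr;
  uint64_t planned = 0, used = 0;
  scratch_example(&buf, 1 << 20, planned, false); // dry run, buf stays null
  scratch_example(&buf, 1 << 20, used, true);     // real allocation
  std::free(buf);
  return planned == used ? 0 : 1;
}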

View File

@@ -66,6 +66,13 @@ __device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
r = __ll2double_rn(x);
}
template <>
__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
double &r) {
// We truncate x
r = __ll2double_rn(static_cast<uint64_t>(x));
}
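A host-side analogue of the specialization above (hedged; __ll2double_rn is the device intrinsic converting a signed 64-bit integer to double with round-to-nearest): only the low 64 bits of the 128-bit torus element survive the cast.

#include <cstdint>

double torus128_to_double_lo(__uint128_t x) {
  return static_cast<double>(static_cast<int64_t>(static_cast<uint64_t>(x)));
}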
template <typename T>
__device__ inline T init_decomposer_state(T input, uint32_t base_log,
uint32_t level_count) {

View File

@@ -74,10 +74,9 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
/// asynchronously.
void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index,
uint64_t *size_tracker,
uint64_t &size_tracker,
bool allocate_gpu_memory) {
if (size_tracker != nullptr)
*size_tracker += size;
size_tracker += size;
void *ptr = nullptr;
if (!allocate_gpu_memory)
return ptr;
@@ -106,8 +105,9 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
uint32_t gpu_index) {
return cuda_malloc_with_size_tracking_async(size, stream, gpu_index, nullptr,
true);
uint64_t size_tracker = 0;
return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
size_tracker, true);
}
/// Check that allocation is valid

View File

@@ -234,6 +234,29 @@ __device__ void convert_u128_to_f128_as_torus(
}
}
// params is expected to be full degree not half degree
// same as convert_u128_to_f128_as_torus() but expects its input to be in registers
template <class params>
__device__ void convert_u128_on_regs_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
const double normalization = pow(2., -128.);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {
auto out_re = u128_to_signed_to_f128(in_re_on_regs[i]);
auto out_im = u128_to_signed_to_f128(in_im_on_regs[i]);
out_re_hi[tid] = out_re.hi * normalization;
out_re_lo[tid] = out_re.lo * normalization;
out_im_hi[tid] = out_im.hi * normalization;
out_im_lo[tid] = out_im.lo * normalization;
tid += params::degree / params::opt;
}
}
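The loop above walks registers with a fixed stride; a hedged host-side sketch of the thread-to-coefficient mapping, assuming degree / opt threads per block:

#include <cstdint>
#include <vector>

std::vector<uint32_t> coefficients_for_thread(uint32_t t, uint32_t degree,
                                              uint32_t opt) {
  std::vector<uint32_t> idx;
  uint32_t tid = t;
  for (uint32_t i = 0; i < opt / 2; ++i) { // opt/2 coefficients per thread
    idx.push_back(tid);
    tid += degree / opt;                   // same stride as the device loop
  }
  return idx;
}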
template <class params>
__device__ void
convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im,
@@ -272,7 +295,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
// ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n, im_lo_0, im_lo_1, ..., im_lo_n]
template <class params>
@@ -291,7 +314,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
}
// params is expected to be full degree not half degree
// converts standqard input into complex<128> represented by 4 double
// converts standard input into complex<128> represented by 4 double
// with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1,
// re_lo_1, im_hi_1, im_lo_1,
// ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ]
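The two comments above describe a planar and an interleaved packing of the four doubles per complex coefficient; a small index sketch (illustration only), with part in {0: re_hi, 1: re_lo, 2: im_hi, 3: im_lo} and n coefficients:

#include <cstddef>

size_t planar_idx(size_t part, size_t i, size_t n) { return part * n + i; }
size_t interleaved_idx(size_t part, size_t i) { return 4 * i + part; }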

View File

@@ -26,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
if (is_signed) {
*mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count,
params, num_blocks,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
}
return size_tracker;
}

View File

@@ -61,7 +61,7 @@ __host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op,
params, num_radix_blocks,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -44,7 +44,7 @@ __host__ uint64_t scratch_extend_radix_with_sign_msb(
*mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_additional_blocks, allocate_gpu_memory, &size_tracker);
num_additional_blocks, allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -100,7 +100,7 @@ __host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_cmux_buffer<Torus>(
streams, gpu_indexes, gpu_count, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory, &size_tracker);
num_radix_blocks, allocate_gpu_memory, size_tracker);
return size_tracker;
}
#endif

View File

@@ -40,6 +40,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
uint32_t num_radix_blocks) {
cuda_set_device(gpu_index);
printf("GPU %d\n", gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -218,6 +219,8 @@ __host__ void is_at_least_one_comparisons_block_true(
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
cudaDeviceSynchronize();
printf("Is at least one comparison block true chunks %d\n", num_chunks);
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
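A worked instance of the chunking above (illustrative values): with remaining_blocks = 10 and max_value = 3, (10 + 3 - 1) / 3 = 4 chunks, i.e. three full chunks plus one chunk of a single block.

#include <cstdint>

uint32_t chunk_count(uint32_t remaining_blocks, uint32_t max_value) {
  return (remaining_blocks + max_value - 1) / max_value; // ceiling division
}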
@@ -228,6 +231,10 @@ __host__ void is_at_least_one_comparisons_block_true(
for (int i = 0; i < num_chunks; i++) {
uint32_t chunk_length =
std::min(max_value, begin_remaining_blocks - i * max_value);
cudaDeviceSynchronize();
printf("chunk length %d, accumulator blocks: %d, input blocks: %d\n", chunk_length,
buffer->tmp_block_accumulated->num_radix_blocks,
mem_ptr->tmp_lwe_array_out->num_radix_blocks);
chunk_lengths[i] = chunk_length;
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension,
@@ -243,6 +250,8 @@ __host__ void is_at_least_one_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
cudaDeviceSynchronize();
printf("Last lut\n");
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -250,6 +259,8 @@ __host__ void is_at_least_one_comparisons_block_true(
lut, 1);
return;
} else {
cudaDeviceSynchronize();
printf("lut with %d blocks\n", num_chunks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
buffer->tmp_block_accumulated, bsks, ksks, ms_noise_reduction_key,
@@ -296,6 +307,8 @@ __host__ void host_compare_blocks_with_zero(
// Accumulator
auto sum = lwe_array_out;
cudaDeviceSynchronize();
printf("Here in compare blocks with zero\n");
if (num_radix_blocks == 1) {
// Just copy
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], sum, 0,
@@ -305,10 +318,16 @@ __host__ void host_compare_blocks_with_zero(
uint32_t remainder_blocks = num_radix_blocks;
auto sum_i = (Torus *)sum->ptr;
auto chunk = (Torus *)lwe_array_in->ptr;
int blocks_check = sum->num_radix_blocks;
cudaDeviceSynchronize();
printf("Here in compare blocks with zero sum %d input %d\n", sum->num_radix_blocks, lwe_array_in->num_radix_blocks);
while (remainder_blocks > 1) {
cudaDeviceSynchronize();
printf("Here in compare blocks with zero remainder blocks %d\n", remainder_blocks);
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);
printf("Chunk size: %d, sum_i blocks: %d, remainder blocks: %d\n", chunk_size, blocks_check, remainder_blocks);
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);
@@ -318,8 +337,11 @@ __host__ void host_compare_blocks_with_zero(
// Update operands
chunk += (chunk_size - 1) * big_lwe_size;
sum_i += big_lwe_size;
blocks_check -= 1;
}
}
cudaDeviceSynchronize();
printf("Here in compare blocks with zero num sum blocks: %d\n", num_sum_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, sum, bsks, ksks,
@@ -684,7 +706,7 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, is_signed,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -370,7 +370,7 @@ __host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
uint64_t size_tracker = 0;
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, &size_tracker);
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -386,7 +386,7 @@ __host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory,
&size_tracker);
size_tracker);
return size_tracker;
}
#endif

View File

@@ -28,7 +28,7 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count,
params, is_signed, num_blocks,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -80,6 +80,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
quotient, 0, num_blocks);
cudaDeviceSynchronize();
printf("Here 0\n");
for (int i = total_bits - 1; i >= 0; i--) {
uint32_t pos_in_block = i % num_bits_in_message;
uint32_t msb_bit_set = total_bits - 1 - i;
@@ -89,6 +92,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// and all blocks after it are also trivial zeros
// This number is in range 1..=num_blocks - 1
uint32_t first_trivial_block = last_non_trivial_block + 1;
printf("num blocks: %d, first trivial block: %d\n", num_blocks, first_trivial_block);
reset_radix_ciphertext_blocks(interesting_remainder1, first_trivial_block);
reset_radix_ciphertext_blocks(interesting_remainder2, first_trivial_block);
reset_radix_ciphertext_blocks(interesting_divisor, first_trivial_block);
@@ -243,18 +247,28 @@ __host__ void host_unsigned_integer_div_rem_kb(
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
cudaDeviceSynchronize();
printf("Here 1\n");
// interesting_divisor
trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
gpu_count);
cudaDeviceSynchronize();
printf("Here 2\n");
// divisor_ms_blocks
trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
// interesting_remainder1
// numerator_block_stack
cudaDeviceSynchronize();
printf("Here 3\n");
left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
gpu_count);
cudaDeviceSynchronize();
printf("Here 4\n");
// interesting_remainder2
left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
gpu_count);
cudaDeviceSynchronize();
printf("Here 5\n");
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -318,6 +332,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
subtraction_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
mem_ptr->overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
compute_borrow, uses_input_borrow);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
};
// fills:
@@ -326,6 +346,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
uint32_t const *gpu_indexes,
uint32_t gpu_count) {
auto trivial_blocks = divisor_ms_blocks;
printf("Trivial blocks: %d\n", trivial_blocks->num_radix_blocks);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
if (trivial_blocks->num_radix_blocks == 0) {
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], at_least_one_upper_block_is_non_zero, 0,
@@ -341,6 +368,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
trivial_blocks->num_radix_blocks,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
cudaDeviceSynchronize();
printf("Before is at least one comparisons block true %d\n", mem_ptr->tmp_1->num_radix_blocks);
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count,
at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
@@ -370,12 +399,20 @@ __host__ void host_unsigned_integer_div_rem_kb(
}
// new_remainder
// subtraction_overflowed
cudaDeviceSynchronize();
printf("Here 6 before overlfow sub\n");
do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
// at_least_one_upper_block_is_non_zero
cudaDeviceSynchronize();
printf("Here 7\n");
check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
// cleaned_merged_interesting_remainder
cudaDeviceSynchronize();
printf("Here 8\n");
create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
gpu_indexes, gpu_count);
cudaDeviceSynchronize();
printf("Here 9\n");
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -441,13 +478,21 @@ __host__ void host_unsigned_integer_div_rem_kb(
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
// cleaned_merged_interesting_remainder
cudaDeviceSynchronize();
printf("Here 10\n");
conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
gpu_indexes, gpu_count);
cudaDeviceSynchronize();
printf("Here 11\n");
// new_remainder
conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
gpu_indexes, gpu_count);
cudaDeviceSynchronize();
printf("Here 12\n");
// quotient
set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
cudaDeviceSynchronize();
printf("Here 13\n");
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -482,10 +527,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
cudaDeviceSynchronize();
printf("Here 14\n");
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
num_blocks);
cudaDeviceSynchronize();
printf("Here 15\n");
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);

View File

@@ -398,7 +398,7 @@ uint64_t scratch_cuda_apply_noise_squashing_mem(
*mem_ptr = new int_noise_squashing_lut<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
polynomial_size, num_radix_blocks, original_num_blocks,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -536,6 +536,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
printf("Active GPUs in lut univ: %d, gpu count: %d\n", active_gpu_count, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, lwe_after_ks_vec[0],
@@ -1472,7 +1473,7 @@ uint64_t scratch_cuda_full_propagation(cudaStream_t const *streams,
uint64_t size_tracker = 0;
*mem_ptr =
new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -1707,7 +1708,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, allocate_gpu_memory,
&size_tracker);
size_tracker);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1743,7 +1744,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, num_many_lut,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1779,7 +1780,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
1, num_radix_blocks, allocate_gpu_memory,
&size_tracker);
size_tracker);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1817,7 +1818,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
uint64_t size_tracker = 0;
*mem_ptr = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
uses_carry, allocate_gpu_memory, &size_tracker);
uses_carry, allocate_gpu_memory, size_tracker);
return size_tracker;
}
// This function performs the three steps of Thomas' new carry propagation
@@ -2080,7 +2081,7 @@ uint64_t scratch_cuda_integer_overflowing_sub(
uint64_t size_tracker = 0;
*mem_ptr = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
compute_overflow, allocate_gpu_memory, &size_tracker);
compute_overflow, allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -2114,12 +2115,24 @@ void host_single_borrow_propagate(
streams[0], gpu_indexes[0], lwe_array, lwe_array, input_borrow, 1,
message_modulus, carry_modulus);
}
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
// Step 1
host_compute_shifted_blocks_and_borrow_states<Torus>(
streams, gpu_indexes, gpu_count, lwe_array,
mem->shifted_blocks_borrow_state_mem, bsks, ksks, ms_noise_reduction_key,
lut_stride, num_many_lut);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem->overflow_block, 0, 1, borrow_states,
@@ -2131,6 +2144,12 @@ void host_single_borrow_propagate(
mem->prop_simu_group_carries_mem, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks, num_groups);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
auto shifted_blocks =
(Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
auto prepared_blocks = mem->prop_simu_group_carries_mem->prepared_blocks;
@@ -2140,10 +2159,22 @@ void host_single_borrow_propagate(
(Torus *)prepared_blocks->ptr, shifted_blocks,
simulators, big_lwe_dimension, num_radix_blocks);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, prepared_blocks, message_modulus,
carry_modulus);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
CudaRadixCiphertextFFI shifted_simulators;
as_radix_ciphertext_slice<Torus>(
@@ -2152,6 +2183,12 @@ void host_single_borrow_propagate(
host_addition<Torus>(streams[0], gpu_indexes[0], mem->overflow_block,
mem->overflow_block, &shifted_simulators, 1);
}
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
CudaRadixCiphertextFFI resolved_borrows;
as_radix_ciphertext_slice<Torus>(
&resolved_borrows, mem->prop_simu_group_carries_mem->resolved_carries,
@@ -2165,49 +2202,60 @@ void host_single_borrow_propagate(
mem->overflow_block, &resolved_borrows, 1);
}
cuda_event_record(mem->incoming_events[0], streams[0], gpu_indexes[0]);
for (int j = 0; j < mem->active_gpu_count; j++) {
cuda_stream_wait_event(mem->sub_streams_1[j], mem->incoming_events[0],
gpu_indexes[j]);
cuda_stream_wait_event(mem->sub_streams_2[j], mem->incoming_events[0],
gpu_indexes[j]);
}
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
auto borrow_flag = mem->lut_borrow_flag;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
streams, gpu_indexes, gpu_count, overflow_block,
mem->overflow_block, bsks, ksks, ms_noise_reduction_key, borrow_flag,
1);
}
for (int j = 0; j < mem->active_gpu_count; j++) {
cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1[j],
gpu_indexes[j]);
}
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
// subtract borrow and cleanup prepared blocks
auto resolved_carries = mem->prop_simu_group_carries_mem->resolved_carries;
host_negation<Torus>(
mem->sub_streams_2[0], gpu_indexes[0], (Torus *)resolved_carries->ptr,
streams[0], gpu_indexes[0], (Torus *)resolved_carries->ptr,
(Torus *)resolved_carries->ptr, big_lwe_dimension, num_groups);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
host_radix_sum_in_groups<Torus>(
mem->sub_streams_2[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
resolved_carries, num_radix_blocks, mem->group_size);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
auto message_extract = mem->lut_message_extract;
printf("lut blocks: %d, call with %d\n", message_extract->num_blocks, num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->sub_streams_2, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
bsks, ksks, ms_noise_reduction_key, message_extract, num_radix_blocks);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %d\n", i);
check_cuda_error(cudaGetLastError());
}
for (int j = 0; j < mem->active_gpu_count; j++) {
cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2[j],
gpu_indexes[j]);
cuda_stream_wait_event(streams[0], mem->outgoing_events1[j],
gpu_indexes[0]);
cuda_stream_wait_event(streams[0], mem->outgoing_events2[j],
gpu_indexes[0]);
}
}
/// num_radix_blocks corresponds to the number of blocks on which to apply the

View File

@@ -236,58 +236,11 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Supported N's are powers of two in the interval [256..16384].")
}
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
}
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(

View File

@@ -284,11 +284,11 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus, class params>
template <typename Torus>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
@@ -367,7 +367,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
bool needs_processing = false;
radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
num_radix_in_vec, chunk_size, needs_processing);
int number_of_threads = std::min(256, params::degree);
int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
@@ -463,9 +463,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
cuda_memset_async(
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], current_blocks, num_radix_blocks,
num_radix_blocks + 1);
auto active_gpu_count =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
@@ -666,7 +666,7 @@ __host__ void host_integer_mult_radix_kb(
size_t b_id = i % num_blocks;
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks);
@@ -690,7 +690,7 @@ __host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_mul_memory<Torus>(
streams, gpu_indexes, gpu_count, params, is_boolean_left,
is_boolean_right, num_radix_blocks, allocate_gpu_memory, &size_tracker);
is_boolean_right, num_radix_blocks, allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -121,7 +121,7 @@ __host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_overflowing_sub_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, allocate_gpu_memory,
allocate_ms_array, &size_tracker);
allocate_ms_array, size_tracker);
POP_RANGE()
return size_tracker;
}
@@ -165,10 +165,22 @@ __host__ void host_integer_overflowing_sub(
stream[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus);
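// Debug aid: force a full device sync on every GPU and check for errors
// before and after the borrow propagation.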
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %u\n", i);
check_cuda_error(cudaGetLastError());
}
host_single_borrow_propagate<Torus>(
streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
for (uint i = 0; i < gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
cudaDeviceSynchronize();
printf("Synchronize gpu %u\n", i);
check_cuda_error(cudaGetLastError());
}
POP_RANGE()
}

View File

@@ -13,7 +13,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
CudaRadixCiphertextFFI *radix,
const uint32_t num_radix_blocks,
const uint32_t lwe_dimension,
uint64_t *size_tracker,
uint64_t &size_tracker,
bool allocate_gpu_memory) {
PUSH_RANGE("create zero radix ct");
radix->lwe_dimension = lwe_dimension;

View File

@@ -277,8 +277,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
auto overflowed = x_0 < x_1;
return (Torus)(invert_flags.second ^ overflowed);
};
uint64_t size = 0;
int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);
streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),
@@ -578,8 +579,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
is_x_less_than_y_given_input_borrow<Torus>(x_0, x_1, 0,
message_modulus);
};
uint64_t size = 0;
int_radix_lut<Torus> *one_block_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, 1, true, nullptr);
streams, gpu_indexes, gpu_count, params, 1, 1, true, size);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], one_block_lut->get_lut(0, 0),

View File

@@ -52,3 +52,195 @@ void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
}
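// Allocates the scratch buffer used by the signed scalar division and
// returns the number of bytes tracked for that allocation.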
uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_scalar_bits, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool l_exceed_threshold, bool is_power_of_two, bool multiplier_is_small,
bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_integer_signed_scalar_div_radix_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
(int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
num_scalar_bits, allocate_gpu_memory, is_absolute_divisor_one,
is_divisor_negative, l_exceed_threshold, is_power_of_two,
multiplier_is_small);
}
void cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *ksks,
void *const *bsks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool l_exceed_threshold, bool is_power_of_two, bool multiplier_is_small,
uint32_t l, uint32_t shift_post, bool is_rhs_power_of_two, bool is_rhs_zero,
bool is_rhs_one, uint32_t rhs_shift, uint32_t numerator_bits,
uint32_t num_scalars, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set) {
host_integer_signed_scalar_div_radix_kb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, numerator_ct,
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, (uint64_t **)ksks, bsks,
ms_noise_reduction_key, is_absolute_divisor_one, is_divisor_negative,
l_exceed_threshold, is_power_of_two, multiplier_is_small, l, shift_post,
is_rhs_power_of_two, is_rhs_zero, is_rhs_one, rhs_shift, numerator_bits,
num_scalars, decomposed_scalar, has_at_least_one_set);
}
void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_signed_scalar_div_mem<uint64_t> *mem_ptr =
(int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool is_divisor_power_of_two,
bool log2_divisor_exceeds_threshold, bool multiplier_exceeds_threshold,
uint32_t num_scalar_bits_for_div, uint32_t num_scalar_bits_for_mul,
uint32_t ilog2_divisor, uint64_t divisor, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
(int_unsigned_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
allocate_gpu_memory, is_divisor_power_of_two,
log2_divisor_exceeds_threshold, multiplier_exceeds_threshold,
num_scalar_bits_for_div, num_scalar_bits_for_mul, ilog2_divisor, divisor);
}
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *ksks, void *const *bsks,
uint64_t const *decomposed_scalar_for_div,
uint64_t const *decomposed_scalar_for_mul,
uint64_t const *has_at_least_one_set_for_div,
uint64_t const *has_at_least_one_set_for_mul,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
uint32_t num_scalars_for_div, uint32_t num_scalars_for_mul,
bool multiplier_exceeds_threshold, bool is_divisor_power_of_two,
bool log2_divisor_exceeds_threshold, uint32_t ilog2_divisor,
uint64_t divisor, uint64_t shift_pre, uint32_t shift_post, uint64_t rhs,
void const *clear_blocks, void const *h_clear_blocks,
uint32_t num_clear_blocks) {
host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
remainder_ct, (int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr,
(uint64_t **)ksks, bsks, decomposed_scalar_for_div,
decomposed_scalar_for_mul, has_at_least_one_set_for_div,
has_at_least_one_set_for_mul, ms_noise_reduction_key, num_scalars_for_div,
num_scalars_for_mul, multiplier_exceeds_threshold,
is_divisor_power_of_two, log2_divisor_exceeds_threshold, ilog2_divisor,
divisor, shift_pre, shift_post, rhs, (uint64_t *)clear_blocks,
(uint64_t *)h_clear_blocks, num_clear_blocks);
}
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
(int_unsigned_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
delete mem_ptr;
*mem_ptr_void = nullptr;
}
uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
uint32_t num_scalar_bits_for_div, uint32_t num_scalar_bits_for_mul,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool l_exceed_threshold, bool is_absolute_divisor_power_of_two,
bool is_divisor_zero, bool multiplier_is_small, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, params,
(int_signed_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
allocate_gpu_memory, num_scalar_bits_for_div, num_scalar_bits_for_mul,
is_absolute_divisor_one, is_divisor_negative, l_exceed_threshold,
is_absolute_divisor_power_of_two, is_divisor_zero, multiplier_is_small);
}
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
int8_t *mem_ptr, void *const *ksks, void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool is_divisor_zero, bool l_exceed_threshold,
bool is_absolute_divisor_power_of_two, bool multiplier_is_small, uint32_t l,
uint32_t shift_post, bool is_rhs_power_of_two, bool is_rhs_zero,
bool is_rhs_one, uint32_t rhs_shift, uint32_t divisor_shift,
uint32_t numerator_bits, uint32_t num_scalars_for_div,
uint32_t num_scalars_for_mul, uint64_t const *decomposed_scalar_for_div,
uint64_t const *decomposed_scalar_for_mul,
uint64_t const *has_at_least_one_set_for_div,
uint64_t const *has_at_least_one_set_for_mul) {
host_integer_signed_scalar_div_rem_radix<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, quotient_ct,
remainder_ct, (int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr,
(uint64_t **)ksks, bsks, ms_noise_reduction_key, is_absolute_divisor_one,
is_divisor_negative, is_divisor_zero, l_exceed_threshold,
is_absolute_divisor_power_of_two, multiplier_is_small, l, shift_post,
is_rhs_power_of_two, is_rhs_zero, is_rhs_one, rhs_shift, divisor_shift,
numerator_bits, num_scalars_for_div, num_scalars_for_mul,
decomposed_scalar_for_div, decomposed_scalar_for_mul,
has_at_least_one_set_for_div, has_at_least_one_set_for_mul);
}
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =
(int_signed_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
delete mem_ptr;
*mem_ptr_void = nullptr;
}

View File

@@ -2,6 +2,7 @@
#define SCALAR_DIV_CUH
#include "integer/integer_utilities.h"
#include "integer/scalar_bitops.cuh"
#include "integer/scalar_mul.cuh"
#include "integer/scalar_shifts.cuh"
#include "integer/subtraction.cuh"
@@ -21,7 +22,7 @@ __host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, is_divisor_power_of_two,
log2_divisor_exceeds_threshold, multiplier_exceeds_threshold,
ilog2_divisor, num_scalar_bits, &size_tracker);
ilog2_divisor, num_scalar_bits, size_tracker);
return size_tracker;
}
@@ -74,27 +75,27 @@ __host__ void host_integer_unsigned_scalar_div_radix(
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
numerator_cpy, numerator_ct);
host_integer_radix_scalar_mul_high_kb(
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, numerator_cpy,
mem_ptr->scalar_mul_high_mem, ksks, rhs, decomposed_scalar,
has_at_least_one_set, ms_noise_reduction_key, bsks, num_scalars);
host_sub_and_propagate_single_carry(
host_sub_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
host_add_and_propagate_single_carry(
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, numerator_cpy, nullptr,
nullptr, mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, shift_post - (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
@@ -102,20 +103,362 @@ __host__ void host_integer_unsigned_scalar_div_radix(
return;
}
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, shift_pre,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
host_integer_radix_scalar_mul_high_kb(
host_integer_radix_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct,
mem_ptr->scalar_mul_high_mem, ksks, rhs, decomposed_scalar,
has_at_least_one_set, ms_noise_reduction_key, bsks, num_scalars);
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, shift_post,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
numerator_ct->num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
uint32_t num_scalar_bits, const bool allocate_gpu_memory,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool l_exceed_threshold, bool is_power_of_two, bool multiplier_is_small) {
uint64_t size_tracker = 0;
*mem_ptr = new int_signed_scalar_div_mem<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_scalar_bits, allocate_gpu_memory, is_absolute_divisor_one,
is_divisor_negative, l_exceed_threshold, is_power_of_two,
multiplier_is_small, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_signed_scalar_div_radix_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *numerator_ct,
int_signed_scalar_div_mem<Torus> *mem_ptr, Torus *const *ksks,
void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool l_exceed_threshold, bool is_power_of_two, bool multiplier_is_small,
uint32_t l, uint32_t shift_post, bool is_rhs_power_of_two, bool is_rhs_zero,
bool is_rhs_one, uint32_t rhs_shift, uint32_t numerator_bits,
uint32_t num_scalars, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set) {
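// |divisor| == 1: the quotient is the numerator itself, negated when the
// divisor is negative.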
if (is_absolute_divisor_one) {
if (is_divisor_negative) {
CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_ct,
mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
numerator_ct->num_radix_blocks);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
numerator_ct, tmp);
}
return;
}
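// The shift amount l exceeds the numerator bit width, so the quotient is
// necessarily zero.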
if (l_exceed_threshold) {
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], numerator_ct, 0,
numerator_ct->num_radix_blocks);
return;
}
CudaRadixCiphertextFFI *tmp = mem_ptr->tmp_ffi;
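// |divisor| == 2^l: build the rounding bias by shifting the numerator
// arithmetically by l - 1 and logically by numerator_bits - l, add it to
// the numerator, then arithmetic-shift the sum by l so the division rounds
// towards zero.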
if (is_power_of_two) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp, l - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_bits - l,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
tmp->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp, l,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
} else if (multiplier_is_small) {
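// Small multiplier: q = (mulhigh(n, m) >> shift_post) - xsign, where
// xsign = n >> (numerator_bits - 1) is -1 for negative numerators and 0
// otherwise.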
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
ksks, is_rhs_power_of_two, is_rhs_zero, is_rhs_one, rhs_shift,
decomposed_scalar, has_at_least_one_set, ms_noise_reduction_key, bsks,
num_scalars);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp, shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
host_sub_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
} else {
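// General case: q = ((mulhigh(n, m) + n) >> shift_post) - xsign, with the
// same sign correction as in the small-multiplier branch.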
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], tmp,
numerator_ct);
host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp, mem_ptr->scalar_mul_high_mem,
ksks, is_rhs_power_of_two, is_rhs_zero, is_rhs_one, rhs_shift,
decomposed_scalar, has_at_least_one_set, ms_noise_reduction_key, bsks,
num_scalars);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, numerator_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp, shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], xsign,
numerator_ct);
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
host_sub_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, tmp, xsign, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
}
if (is_divisor_negative) {
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, numerator_ct, tmp,
mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus,
numerator_ct->num_radix_blocks);
} else {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
tmp);
}
}
template <typename Torus>
__host__ uint64_t scratch_integer_unsigned_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
int_unsigned_scalar_div_rem_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
bool is_divisor_power_of_two, bool log2_divisor_exceeds_threshold,
bool multiplier_exceeds_threshold, uint32_t num_scalar_bits_for_div,
uint32_t num_scalar_bits_for_mul, uint32_t ilog2_divisor,
uint64_t divisor) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unsigned_scalar_div_rem_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, true, num_scalar_bits_for_div,
num_scalar_bits_for_mul, is_divisor_power_of_two,
log2_divisor_exceeds_threshold, multiplier_exceeds_threshold,
ilog2_divisor, divisor, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_unsigned_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, Torus *const *ksks,
void *const *bsks, uint64_t const *decomposed_scalar_for_div,
uint64_t const *decomposed_scalar_for_mul,
uint64_t const *has_at_least_one_set_for_div,
uint64_t const *has_at_least_one_set_for_mul,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_scalars_for_div, uint32_t num_scalars_for_mul,
bool multiplier_exceeds_threshold, bool is_divisor_power_of_two,
bool log2_divisor_exceeds_threshold, uint32_t ilog2_divisor,
uint64_t divisor, uint64_t shift_pre, uint32_t shift_post, uint64_t rhs,
Torus const *clear_blocks, Torus const *h_clear_blocks,
uint32_t num_clear_blocks) {
auto numerator_ct = mem_ptr->numerator_ct;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
quotient_ct);
host_integer_unsigned_scalar_div_radix(
streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->unsigned_div_mem,
ksks, decomposed_scalar_for_div, has_at_least_one_set_for_div,
ms_noise_reduction_key, bsks, num_scalars_for_div,
multiplier_exceeds_threshold, is_divisor_power_of_two,
log2_divisor_exceeds_threshold, ilog2_divisor, shift_pre, shift_post,
rhs);
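// Remainder: for a power-of-two divisor it is a bit mask of the numerator;
// otherwise r = n - q * divisor.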
if (is_divisor_power_of_two) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
numerator_ct);
host_integer_radix_scalar_bitop_kb(
streams, gpu_indexes, gpu_count, remainder_ct, remainder_ct,
clear_blocks, h_clear_blocks, num_clear_blocks, mem_ptr->bitop_mem,
bsks, ksks, ms_noise_reduction_key);
} else {
if (divisor != (uint64_t)0) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
remainder_ct, quotient_ct);
if (divisor != (uint64_t)1 && remainder_ct->num_radix_blocks != 0) {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, remainder_ct,
decomposed_scalar_for_mul, has_at_least_one_set_for_mul,
mem_ptr->scalar_mul_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars_for_mul);
}
}
host_sub_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
numerator_ct);
}
}
template <typename Torus>
__host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
int_signed_scalar_div_rem_buffer<Torus> **mem_ptr,
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
uint32_t num_scalar_bits_for_div, uint32_t num_scalar_bits_for_mul,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool l_exceed_threshold, bool is_absolute_divisor_power_of_two,
bool is_divisor_zero, bool multiplier_is_small) {
uint64_t size_tracker = 0;
*mem_ptr = new int_signed_scalar_div_rem_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, true, num_scalar_bits_for_div,
num_scalar_bits_for_mul, is_absolute_divisor_one, is_divisor_negative,
l_exceed_threshold, is_absolute_divisor_power_of_two, is_divisor_zero,
multiplier_is_small, size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_signed_scalar_div_rem_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, Torus *const *ksks,
void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
bool is_absolute_divisor_one, bool is_divisor_negative,
bool is_divisor_zero, bool l_exceed_threshold,
bool is_absolute_divisor_power_of_two, bool multiplier_is_small, uint32_t l,
uint32_t shift_post, bool is_rhs_power_of_two, bool is_rhs_zero,
bool is_rhs_one, uint32_t rhs_shift, uint32_t divisor_shift,
uint32_t numerator_bits, uint32_t num_scalars_for_div,
uint32_t num_scalars_for_mul, uint64_t const *decomposed_scalar_for_div,
uint64_t const *decomposed_scalar_for_mul,
uint64_t const *has_at_least_one_set_for_div,
uint64_t const *has_at_least_one_set_for_mul) {
auto numerator_ct = mem_ptr->numerator_ct;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], numerator_ct,
quotient_ct);
host_integer_signed_scalar_div_radix_kb(
streams, gpu_indexes, gpu_count, quotient_ct, mem_ptr->signed_div_mem,
ksks, bsks, ms_noise_reduction_key, is_absolute_divisor_one,
is_divisor_negative, l_exceed_threshold, is_absolute_divisor_power_of_two,
multiplier_is_small, l, shift_post, is_rhs_power_of_two, is_rhs_zero,
is_rhs_one, rhs_shift, numerator_bits, num_scalars_for_div,
decomposed_scalar_for_div, has_at_least_one_set_for_div);
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, quotient_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
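// Remainder: r = n - q * divisor; for a positive power-of-two divisor,
// q * divisor reduces to a logical shift of the quotient by divisor_shift.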
if (!is_divisor_negative && is_absolute_divisor_power_of_two) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
quotient_ct);
host_integer_radix_logical_scalar_shift_kb_inplace(
streams, gpu_indexes, gpu_count, remainder_ct, divisor_shift,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
remainder_ct->num_radix_blocks);
} else if (!is_divisor_zero) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
quotient_ct);
bool is_divisor_one = is_absolute_divisor_one && !is_divisor_negative;
if (!is_divisor_one && remainder_ct->num_radix_blocks != 0) {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, remainder_ct,
decomposed_scalar_for_mul, has_at_least_one_set_for_mul,
mem_ptr->scalar_mul_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars_for_mul);
}
}
host_sub_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, numerator_ct, remainder_ct, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder_ct,
numerator_ct);
}
#endif

View File

@@ -29,59 +29,12 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {
switch (polynomial_size) {
case 512:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 1024:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 2048:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 4096:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 8192:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
case 16384:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
break;
default:
PANIC("Cuda error (scalar multiplication): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
}
host_integer_scalar_mul_radix<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
}
void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,

View File

@@ -38,11 +38,11 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_scalar_bits, allocate_gpu_memory, true, &size_tracker);
num_scalar_bits, allocate_gpu_memory, true, size_tracker);
return size_tracker;
}
template <typename T, class params>
template <typename T>
__host__ void host_integer_scalar_mul_radix(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
@@ -80,7 +80,7 @@ __host__ void host_integer_scalar_mul_radix(
}
}
size_t j = 0;
for (size_t i = 0; i < min(num_scalars, num_ciphertext_bits); i++) {
for (size_t i = 0; i < std::min(num_scalars, num_ciphertext_bits); i++) {
if (decomposed_scalar[i] == 1) {
// Perform a block shift
CudaRadixCiphertextFFI preshifted_radix_ct;
@@ -116,7 +116,7 @@ __host__ void host_integer_scalar_mul_radix(
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
} else {
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
host_integer_partial_sum_ciphertexts_vec_kb<T>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
@@ -173,8 +173,9 @@ template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high<Torus> *mem_ptr, Torus *const *ksks, uint64_t rhs,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
uint64_t rhs, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars) {
@@ -189,7 +190,8 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
gpu_indexes);
if (rhs != (uint64_t)1 || tmp_ffi->num_radix_blocks != 0) {
if (num_scalars != (uint32_t)0 && rhs != (uint64_t)1 &&
tmp_ffi->num_radix_blocks != 0) {
if ((rhs & (rhs - 1)) == 0) {
uint32_t shift = std::log2(rhs);
@@ -200,55 +202,56 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
} else {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
}
}
switch (mem_ptr->params.polynomial_size) {
case 512:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 1024:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 2048:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 4096:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 8192:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 16384:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
default:
PANIC(
"Cuda error (scalar multiplication): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
}
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
}
template <typename Torus>
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
bool is_rhs_power_of_two, bool is_rhs_zero, bool is_rhs_one,
uint32_t rhs_shift, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars) {
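// rhs == 0: the high half of the product is identically zero.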
if (is_rhs_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
0, ct->num_radix_blocks);
return;
}
CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;
host_extend_radix_with_sign_msb<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi, ct, mem_ptr->extend_radix_mem,
ct->num_radix_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
if (num_scalars != (uint32_t)0 && !is_rhs_one &&
tmp_ffi->num_radix_blocks != 0) {
if (is_rhs_power_of_two) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi, rhs_shift,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
} else {
host_integer_scalar_mul_radix<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
}
}

View File

@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -133,7 +133,7 @@ __host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params, num_radix_blocks,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -21,7 +21,7 @@ __host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
uint64_t size_tracker = 0;
*mem_ptr = new int_shift_and_rotate_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, is_signed, params,
num_radix_blocks, allocate_gpu_memory, &size_tracker);
num_radix_blocks, allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -23,7 +23,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry(
*mem_ptr = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -35,6 +35,20 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
static_cast<cudaStream_t>(stream), gpu_index);
}
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor) {
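// The multi-bit BSK holds input_lwe_dim / grouping_factor groups, each
// containing 2^grouping_factor GGSW ciphertexts of
// (glwe_dim + 1)^2 * level_count polynomials.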
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
level_count * (1 << grouping_factor) /
grouping_factor;
size_t buffer_size =
total_polynomials * polynomial_size * sizeof(__uint128_t);
cuda_memcpy_async_to_gpu((__uint128_t *)dest, (__uint128_t *)src, buffer_size,
static_cast<cudaStream_t>(stream), gpu_index);
}
// We need these lines so the compiler knows how to specialize these functions
template __device__ const uint64_t *
get_ith_mask_kth_block(const uint64_t *ptr, int i, int k, int level,
@@ -80,6 +94,14 @@ template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int glwe_dimension,
uint32_t level_count);
template __device__ const __uint128_t *
get_multi_bit_ith_lwe_gth_group_kth_block(const __uint128_t *ptr, int g, int i,
int k, int level,
uint32_t grouping_factor,
uint32_t polynomial_size,
uint32_t glwe_dimension,
uint32_t level_count);
template __device__ const uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
const uint64_t *ptr, int g, int i, int k, int level,
uint32_t grouping_factor, uint32_t polynomial_size, uint32_t glwe_dimension,

View File

@@ -83,6 +83,62 @@ mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
__syncthreads();
}
/** Perform the matrix multiplication between the GGSW and the GLWE,
* each block operating on a single level for mask and body.
* Both operands should be in the Fourier domain.
*
* This function assumes:
* - Thread blocks at dimension z relate to the decomposition level.
* - Thread blocks at dimension y relate to the glwe dimension.
* - polynomial_size / params::opt threads are available per block.
*/
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
double *fft, double *join_buffer,
const double *__restrict__ bootstrapping_key, int iteration, G &group,
bool support_dsm = false) {
const uint32_t polynomial_size = params::degree;
const uint32_t glwe_dimension = gridDim.y - 1;
const uint32_t level_count = gridDim.z;
// The first product is used to initialize level_join_buffer
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
// Continue multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
glwe_dimension, level_count);
for (int j = 0; j < glwe_dimension + 1; j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
auto buffer_slice = get_join_buffer_element_128<G>(
blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_product_accumulate_in_fourier_domain_128<params>(
buffer_slice, fft, bsk_poly, j == 0);
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block.
// Accumulate the rest of the products into the fft buffer.
for (int l = 0; l < level_count; l++) {
auto cur_src_acc = get_join_buffer_element_128<G>(
l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
l == 0);
}
__syncthreads();
}
template <typename Torus>
void execute_pbs_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -223,7 +279,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array,
uint64_t *size_tracker) {
uint64_t &size_tracker) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
// 32 bits
@@ -231,7 +287,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
case MULTI_BIT:
PANIC("Error: 32-bit multibit PBS is not supported.\n")
case CLASSICAL:
*size_tracker = scratch_cuda_programmable_bootstrap_32(
size_tracker = scratch_cuda_programmable_bootstrap_32(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);
@@ -246,12 +302,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
case MULTI_BIT:
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
*size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
size_tracker = scratch_cuda_multi_bit_programmable_bootstrap_64(
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
break;
case CLASSICAL:
*size_tracker = scratch_cuda_programmable_bootstrap_64(
size_tracker = scratch_cuda_programmable_bootstrap_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);

View File

@@ -8,9 +8,9 @@ execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array,
uint64_t *size_tracker_on_gpu) {
uint64_t &size_tracker_on_gpu) {
// The squash noise function receives as input 64-bit integers
*size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);

View File

@@ -286,7 +286,7 @@ __host__ uint64_t scratch_programmable_bootstrap_amortized(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
check_cuda_error(cudaGetLastError());
return size_tracker;
}

View File

@@ -225,7 +225,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
allocate_gpu_memory, allocate_ms_array, size_tracker);
return size_tracker;
}

View File

@@ -280,13 +280,14 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -386,7 +386,7 @@ __host__ uint64_t scratch_programmable_bootstrap(
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
allocate_gpu_memory, allocate_ms_array, size_tracker);
return size_tracker;
}

View File

@@ -18,62 +18,6 @@
#include "programmable_bootstrap.cuh"
#include "types/complex/operations.cuh"
/** Perform the matrix multiplication between the GGSW and the GLWE,
* each block operating on a single level for mask and body.
* Both operands should be in the Fourier domain.
*
* This function assumes:
* - Thread blocks at dimension z relate to the decomposition level.
* - Thread blocks at dimension y relate to the glwe dimension.
* - polynomial_size / params::opt threads are available per block.
*/
template <typename G, class params>
__device__ void mul_ggsw_glwe_in_fourier_domain_128(
double *fft, double *join_buffer,
const double *__restrict__ bootstrapping_key, int iteration, G &group,
bool support_dsm = false) {
const uint32_t polynomial_size = params::degree;
const uint32_t glwe_dimension = gridDim.y - 1;
const uint32_t level_count = gridDim.z;
// The first product is used to initialize level_join_buffer
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
// Continue multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
auto bsk_slice = get_ith_mask_kth_block_128(
bootstrapping_key, iteration, blockIdx.y, blockIdx.z, polynomial_size,
glwe_dimension, level_count);
for (int j = 0; j < glwe_dimension + 1; j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * polynomial_size / 2 * 4;
auto buffer_slice = get_join_buffer_element_128<G>(
blockIdx.z, idx, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_product_accumulate_in_fourier_domain_128<params>(
buffer_slice, fft, bsk_poly, j == 0);
group.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block.
// Accumulate the rest of the products into the fft buffer.
for (int l = 0; l < level_count; l++) {
auto cur_src_acc = get_join_buffer_element_128<G>(
l, blockIdx.y, group, join_buffer, polynomial_size, glwe_dimension,
support_dsm);
polynomial_accumulate_in_fourier_domain_128<params>(fft, cur_src_acc,
l == 0);
}
__syncthreads();
}
template <typename InputTorus, class params, sharedMemDegree SMD,
bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
@@ -174,9 +118,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
auto acc_fft_re_lo = accumulator_fft + 1 * params::degree / 2;
@@ -455,6 +396,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
cuda_set_device(gpu_index);
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
polynomial_size);
@@ -484,7 +426,7 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
allocate_gpu_memory, allocate_ms_array, size_tracker);
return size_tracker;
}
@@ -591,7 +533,7 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
allocate_gpu_memory, allocate_ms_array, size_tracker);
return size_tracker;
}

View File

@@ -455,11 +455,8 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
*/
template <typename Torus, class params>
uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
uint32_t polynomial_size,
uint64_t full_sm_keybundle) {
int max_blocks_per_sm;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

View File

@@ -521,13 +521,14 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -0,0 +1,361 @@
#include "programmable_bootstrap_cg_multibit.cuh"
#include "programmable_bootstrap_multibit_128.cuh"
template <typename InputTorus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename InputTorus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<
__uint128_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(
stream, gpu_index,
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
return scratch_cuda_multi_bit_programmable_bootstrap_128<uint64_t>(
stream, gpu_index,
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> **>(buffer),
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
}
template <typename InputTorus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
template <typename InputTorus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (polynomial_size) {
case 256:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
auto *buffer =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(mem_ptr);
switch (buffer->pbs_variant) {
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<
uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,
int8_t **buffer) {
const auto x =
reinterpret_cast<pbs_buffer_128<uint64_t, MULTI_BIT> *>(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
/**
 * Computes divisors of the product of num_sms (streaming multiprocessors on the
 * GPU) and max_blocks_per_sm (maximum active blocks per SM when launching
 * device_multi_bit_programmable_bootstrap_keybundle) that do not exceed its
 * square root, based on max_num_pbs. If log2(max_num_pbs) <= 13, the first
 * suitable divisor is selected. If it is greater, an offset of
 * log2(max_num_pbs) - 11 is used to pick a smaller divisor (see the
 * CUDA_ARCH < 900 branch below).
 *
 * The value 13 was empirically determined based on memory requirements for
 * benchmarking on an RTX 4090 GPU, balancing performance and resource use.
 */
template <typename Torus, class params>
uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size,
uint64_t full_sm_keybundle) {
int max_blocks_per_sm;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
NOSM>,
polynomial_size / params::opt, full_sm_keybundle);
else
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle_128<Torus, params,
FULLSM>,
polynomial_size / params::opt, 0);
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
int x = num_sms * max_blocks_per_sm;
int count = 0;
int divisor = 1;
int ith_divisor = 0;
#if CUDA_ARCH < 900
// We pick a smaller divisor on GPUs other than H100, so that 256-bit integer
// multiplication can run
int log2_max_num_pbs = log2_int(max_num_pbs);
if (log2_max_num_pbs > 13)
ith_divisor = log2_max_num_pbs - 11;
#endif
for (int i = sqrt(x); i >= 1; i--) {
if (x % i == 0) {
if (count == ith_divisor) {
divisor = i;
break;
} else {
count++;
}
}
}
return divisor;
}
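To make this heuristic concrete, below is a minimal, self-contained sketch of the divisor walk above, with illustrative numbers (num_sms = 128, max_blocks_per_sm = 2, so x = 256); `pick_divisor` and the sample values are assumptions for this example, not backend API:

```cpp
#include <cmath>
#include <cstdio>

// Standalone re-implementation of the divisor walk in get_lwe_chunk_size_128,
// for illustration only; pick_divisor is a hypothetical helper name.
static int pick_divisor(int x, int ith_divisor) {
  int count = 0;
  int divisor = 1;
  // Walk divisors of x downward from sqrt(x); the ith_divisor-th one found wins.
  for (int i = static_cast<int>(std::sqrt(x)); i >= 1; i--) {
    if (x % i == 0) {
      if (count == ith_divisor) {
        divisor = i;
        break;
      }
      count++;
    }
  }
  return divisor;
}

int main() {
  int x = 128 * 2; // num_sms * max_blocks_per_sm; divisors <= 16 are 16, 8, 4, 2, 1
  std::printf("%d\n", pick_divisor(x, 0)); // 16: small workloads keep the largest chunk
  std::printf("%d\n", pick_divisor(x, 3)); // 2: e.g. log2(max_num_pbs) = 14 -> offset 14 - 11 = 3
  return 0;
}
```

A larger offset thus selects a smaller divisor, i.e. a smaller LWE chunk, trading keybundle parallelism for a lower memory footprint per chunk.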

File diff suppressed because it is too large

View File

@@ -253,7 +253,7 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::TBC,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
allocate_gpu_memory, allocate_ms_array, size_tracker);
return size_tracker;
}

View File

@@ -283,13 +283,14 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size =
get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
polynomial_size, full_sm_keybundle);
uint64_t size_tracker = 0;
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
allocate_gpu_memory, &size_tracker);
allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -5,15 +5,15 @@
#include <stdio.h>
#include <type_traits>
template <typename T> inline __device__ const char *get_format();
template <typename T> __device__ inline const char *get_format();
template <> inline __device__ const char *get_format<int>() { return "%d, "; }
template <> __device__ inline const char *get_format<int>() { return "%d, "; }
template <> inline __device__ const char *get_format<unsigned int>() {
template <> __device__ inline const char *get_format<unsigned int>() {
return "%u, ";
}
template <> inline __device__ const char *get_format<uint64_t>() {
template <> __device__ inline const char *get_format<uint64_t>() {
return "%lu, ";
}
@@ -23,6 +23,15 @@ template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
}
}
template <>
__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
for (int i = 0; i < N; i++) {
uint64_t low = static_cast<uint64_t>(src[i]);
uint64_t high = static_cast<uint64_t>(src[i] >> 64);
printf("(%llu, %llu), ", high, low);
}
}
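Since printf has no native 128-bit conversion specifier, the specialization above emits each __uint128_t as a (high, low) pair of 64-bit halves; for example, the value 2^64 + 5 prints as (1, 5).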
template <>
__global__ inline void print_debug_kernel(const double2 *src, int N) {
for (int i = 0; i < N; i++) {

View File

@@ -39,10 +39,12 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
}
int get_active_gpu_count(int num_inputs, int gpu_count) {
int active_gpu_count = gpu_count;
if (gpu_count > num_inputs) {
active_gpu_count = num_inputs;
}
int threshold_number_of_inputs = 10;
int ceil_div_inputs = std::max(1, (num_inputs + threshold_number_of_inputs - 1) /
threshold_number_of_inputs);
printf("ceil div inputs: %dn, gpu_count: %d\n", ceil_div_inputs, gpu_count);
int active_gpu_count = std::min(ceil_div_inputs, gpu_count);
printf("active gpus: %d\n", active_gpu_count);
return active_gpu_count;
}
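As a worked example of the new logic: with the threshold of 10 inputs per GPU, the function activates at most ceil(num_inputs / 10) GPUs, so 25 inputs give ceil(25 / 10) = 3 active GPUs even on an 8-GPU machine, while 100 or more inputs saturate all 8.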

View File

@@ -9,7 +9,7 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
uint32_t elements_per_gpu,
uint64_t *size_tracker_on_gpu_0,
uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
dest.resize(gpu_count);
@@ -17,10 +17,10 @@ void multi_gpu_alloc_array_async(cudaStream_t const *streams,
uint64_t size_tracker_on_gpu_i = 0;
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
elements_per_gpu * sizeof(Torus), streams[i], gpu_indexes[i],
&size_tracker_on_gpu_i, allocate_gpu_memory);
size_tracker_on_gpu_i, allocate_gpu_memory);
dest[i] = d_array;
if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
*size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
if (i == 0) {
size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
}
}
}
@@ -46,7 +46,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t lwe_size,
uint64_t *size_tracker_on_gpu_0,
uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
@@ -54,10 +54,10 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
&size_tracker_on_gpu_i, allocate_gpu_memory);
size_tracker_on_gpu_i, allocate_gpu_memory);
dest[i] = d_array;
if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
*size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
if (i == 0) {
size_tracker_on_gpu_0 += size_tracker_on_gpu_i;
}
}
}
@@ -65,7 +65,7 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
template void multi_gpu_alloc_lwe_async<__uint128_t>(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory);
/// Allocates the input/output vector for all devices
@@ -75,7 +75,7 @@ template <typename Torus>
void multi_gpu_alloc_lwe_many_lut_output_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t num_many_lut, uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
@@ -83,10 +83,10 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], &size_tracker, allocate_gpu_memory);
gpu_indexes[i], size_tracker, allocate_gpu_memory);
dest[i] = d_array;
if (i == 0 && size_tracker_on_gpu_0 != nullptr) {
*size_tracker_on_gpu_0 += size_tracker;
if (i == 0) {
size_tracker_on_gpu_0 += size_tracker;
}
}
}

View File

@@ -94,7 +94,7 @@ __host__ uint64_t scratch_cuda_expand_without_verification(
*mem_ptr = new zk_expand_mem<Torus>(
streams, gpu_indexes, gpu_count, computing_params, casting_params,
casting_key_type, num_lwes_per_compact_list, is_boolean_array,
num_compact_lists, allocate_gpu_memory, &size_tracker);
num_compact_lists, allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -1531,6 +1531,207 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_signed_scalar_div_radix_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
num_scalar_bits: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
is_absolute_divisor_one: bool,
is_divisor_negative: bool,
l_exceed_threshold: bool,
is_power_of_two: bool,
multiplier_is_small: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_signed_scalar_div_radix_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
numerator_ct: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
bsks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
is_absolute_divisor_one: bool,
is_divisor_negative: bool,
l_exceed_threshold: bool,
is_power_of_two: bool,
multiplier_is_small: bool,
l: u32,
shift_post: u32,
is_rhs_power_of_two: bool,
is_rhs_zero: bool,
is_rhs_one: bool,
rhs_shift: u32,
numerator_bits: u32,
num_scalars: u32,
decomposed_scalar: *const u64,
has_at_least_one_set: *const u64,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
is_divisor_power_of_two: bool,
log2_divisor_exceeds_threshold: bool,
multiplier_exceeds_threshold: bool,
num_scalar_bits_for_div: u32,
num_scalar_bits_for_mul: u32,
ilog2_divisor: u32,
divisor: u64,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
quotient_ct: *mut CudaRadixCiphertextFFI,
remainder_ct: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
bsks: *const *mut ffi::c_void,
decomposed_scalar_for_div: *const u64,
decomposed_scalar_for_mul: *const u64,
has_at_least_one_set_for_div: *const u64,
has_at_least_one_set_for_mul: *const u64,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
num_scalars_for_div: u32,
num_scalars_for_mul: u32,
multiplier_exceeds_threshold: bool,
is_divisor_power_of_two: bool,
log2_divisor_exceeds_threshold: bool,
ilog2_divisor: u32,
divisor: u64,
shift_pre: u64,
shift_post: u32,
rhs: u64,
clear_blocks: *const ffi::c_void,
h_clear_blocks: *const ffi::c_void,
num_clear_blocks: u32,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_integer_signed_scalar_div_rem_radix_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
num_scalar_bits_for_div: u32,
num_scalar_bits_for_mul: u32,
is_absolute_divisor_one: bool,
is_divisor_negative: bool,
l_exceed_threshold: bool,
is_absolute_divisor_power_of_two: bool,
is_divisor_zero: bool,
multiplier_is_small: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_signed_scalar_div_rem_radix_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
quotient_ct: *mut CudaRadixCiphertextFFI,
remainder_ct: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
bsks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
is_absolute_divisor_one: bool,
is_divisor_negative: bool,
is_divisor_zero: bool,
l_exceed_threshold: bool,
is_absolute_divisor_power_of_two: bool,
multiplier_is_small: bool,
l: u32,
shift_post: u32,
is_rhs_power_of_two: bool,
is_rhs_zero: bool,
is_rhs_one: bool,
rhs_shift: u32,
divisor_shift: u32,
numerator_bits: u32,
num_scalars_for_div: u32,
num_scalars_for_mul: u32,
decomposed_scalar_for_div: *const u64,
decomposed_scalar_for_mul: *const u64,
has_at_least_one_set_for_div: *const u64,
has_at_least_one_set_for_mul: *const u64,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;
@@ -2125,6 +2326,19 @@ unsafe extern "C" {
grouping_factor: u32,
);
}
unsafe extern "C" {
pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
stream: *mut ffi::c_void,
gpu_index: u32,
dest: *mut ffi::c_void,
src: *const ffi::c_void,
input_lwe_dim: u32,
glwe_dim: u32,
level_count: u32,
polynomial_size: u32,
grouping_factor: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
stream: *mut ffi::c_void,
@@ -2167,3 +2381,45 @@ unsafe extern "C" {
pbs_buffer: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
mem_ptr: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
grouping_factor: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_multi_bit_programmable_bootstrap_128(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
);
}

View File

@@ -201,9 +201,9 @@ There are some example applications already available in `tfhe/examples/hpu`:
In order to run those applications on hardware, the user must build from the project root (i.e. `tfhe-rs-internal`) with `hpu-v80` features:
> NB: Running the examples requires having correctly pulled the `.hpu` files. Those files, due to their size, are backed by git-lfs and disabled by default.
> In order to retrieve them, use the following command:
> In order to retrieve them, run the following command from the **TFHE-rs** root folder:
> ```bash
> git lfs pull --include="*" --exclude=""
> make pull_hpu_files
> ```
``` bash
@@ -217,7 +217,7 @@ source setup_hpu.sh --config v80
> NB: The error raised when the ".hpu" files weren't correctly fetched can be a bit enigmatic: `memory allocation of ... bytes failed`
> If you encounter this issue, run the following command:
> ```bash
> git lfs pull --include="*" --exclude=""
> make pull_hpu_files
> ```

View File

@@ -482,6 +482,7 @@ pub fn iop_propagate_msb_to_lsb_blockv(
// (op_nb_bool**k)*proc_nb
//assert_eq!(g_a.len(),props.blk_w());
let grp_nb = g_a.len().div_ceil(proc_nb);
let mut level_nb = 0;
let mut stride_size: usize = 1; // in group unit
while stride_size < grp_nb {
for chk in g_a.chunks_mut(op_nb_bool * stride_size * proc_nb) {
@@ -499,31 +500,69 @@ pub fn iop_propagate_msb_to_lsb_blockv(
}
stride_size *= op_nb_bool;
level_nb += 1;
}
// This code was written for a limited size, due to the following
// leveled additions.
assert!(level_nb < op_nb_bool);
// Third step
// Apply
g_a.chunks_mut(proc_nb).rev().fold(None, |acc, chk| {
if let Some(x) = acc {
let mut neigh_a: Vec<metavar::MetaVarCell> = Vec::new();
for _i in 1..level_nb {
neigh_a.push(prog.new_cst(0));
}
let mut neigh = prog.new_cst(0);
let mut prev = None;
g_a.chunks_mut(proc_nb)
.enumerate()
.rev()
.for_each(|(chk_idx, chk)| {
let keep_v0 = chk[0].clone();
let all_neigh = if let Some(x) = &prev {
&neigh + x
} else {
neigh.clone()
};
for (idx, v) in chk.iter_mut().enumerate() {
if idx == 0 {
// [0] is already complete.
// Need to inverse it for 0 if needed
if inverse_output.unwrap_or(false) {
*v = v.pbs(&pbs_is_null, false);
}
// [0] is already complete with prev.
// do not need to add prev
*v = &*v + &neigh;
} else {
*v = &*v + x;
if inverse_output.unwrap_or(false) {
*v = v.pbs(&pbs_is_null, false);
} else {
*v = v.pbs(&pbs_not_null, false);
}
*v = &*v + &all_neigh;
}
// Need to inverse it for 0 if needed
if inverse_output.unwrap_or(false) {
*v = v.pbs(&pbs_is_null, false);
} else {
*v = v.pbs(&pbs_not_null, false);
}
}
}
Some(&chk[0])
});
// For next chunk
prev = Some(keep_v0.clone());
// Update neighbors for next iteration
let mut do_update_neigh = false;
for i in 1..(level_nb as u32) {
if (chk_idx % op_nb_bool.pow(i)) == 0 {
// Update the corresponding neigh value
neigh_a[(i - 1) as usize] = keep_v0.clone();
do_update_neigh = true;
}
}
if do_update_neigh {
neigh = neigh_a[0].clone();
for n in neigh_a.iter().skip(1) {
neigh = &neigh + n;
}
}
});
if inverse_direction.unwrap_or(false) {
g_a.reverse();

View File

@@ -12,5 +12,5 @@
"n3-H100x4": 6.08,
"n3-H100x2": 3.04,
"n3-L40x1": 0.80,
"n3-H100x8-SXM5": 24
"n3-H100x8-SXM5": 19.2
}

View File

@@ -1,17 +0,0 @@
#!/usr/bin/python3
try:
import tomllib # Python v3.11+
except ModuleNotFoundError:
import pip._vendor.tomli as tomllib # the same tomllib that's now included in Python v3.11+
fname = "tests/Cargo.toml"
with open(fname, "rb") as f:
data = tomllib.load(f)
dev_dependencies = data["dev-dependencies"]
branch_name = dev_dependencies["tfhe-backward-compat-data"]["branch"]
print(branch_name)

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# Verify there are no underscores in docs dirs to avoid issues between GitHub and GitBook.
# Mixing both was creating more issues than necessary, so we settled on the least common
# denominator: "-" instead of "_".
from pathlib import Path
import os
DEBUG = False
def main():
curr_file_path = Path(__file__)
root_dir = curr_file_path.parent.parent.resolve()
docs_dir = root_dir / "tfhe/docs"
if not docs_dir.exists():
raise ValueError(f"{docs_dir} does not exist")
problems = []
for idx, (subdirs, dirs, files) in enumerate(os.walk(docs_dir)):
if DEBUG:
print(idx, (subdirs, dirs, files))
subdirs = Path(subdirs).resolve()
for dir_ in dirs:
if "_" in str(dir_):
problems.append(
f"Found dir: {dir_} in {subdirs} containing a '_' instead of a '-', "
f"this is not allowed"
)
for file in files:
if "_" in str(file):
problems.append(
f"Found file: {file} in {subdirs} containing a '_' instead of a '-', "
f"this is not allowed"
)
if len(problems) != 0:
for problem in problems:
print(problem)
raise ValueError
if __name__ == "__main__":
main()

View File

@@ -1,21 +0,0 @@
#!/usr/bin/env bash
set -e
if [ $# -lt 3 ]; then
echo "invalid arguments, usage:\n"
echo "$0 git_url branch dest_path"
exit 1
fi
if ! git lfs env 2>/dev/null >/dev/null; then
echo "git lfs is not installed, please install it and try again"
exit 1
fi
if [ -d $3 ]; then
cd $3 && git remote set-branches origin '*' && git fetch --depth 1 && git reset --hard origin/$2 && git clean -dfx
else
git clone $1 -b $2 --depth 1 $3
fi

View File

@@ -27,6 +27,7 @@ fast_tests_argument=
long_tests_argument=
nightly_tests_argument=
no_big_params_argument=
no_big_params_argument_gpu=
cargo_profile="release"
backend="cpu"
gpu_feature=""
@@ -107,6 +108,10 @@ if [[ "${NO_BIG_PARAMS}" == TRUE ]]; then
no_big_params_argument=--no-big-params
fi
if [[ "${NO_BIG_PARAMS_GPU}" == TRUE ]]; then
no_big_params_argument_gpu=--no-big-params-gpu
fi
if [[ "${backend}" == "gpu" ]]; then
gpu_feature="gpu"
fi
@@ -145,7 +150,7 @@ if [[ "${backend}" == "gpu" ]]; then
fi
fi
filter_expression=$(/usr/bin/python3 scripts/test_filtering.py --layer integer --backend "${backend}" ${fast_tests_argument} ${long_tests_argument} ${nightly_tests_argument} ${multi_bit_argument} ${sign_argument} ${no_big_params_argument})
filter_expression=$(/usr/bin/python3 scripts/test_filtering.py --layer integer --backend "${backend}" ${fast_tests_argument:+$fast_tests_argument} ${long_tests_argument:+$long_tests_argument} ${nightly_tests_argument:+$nightly_tests_argument} ${no_big_params_argument_gpu:+$no_big_params_argument_gpu} ${multi_bit_argument:+$multi_bit_argument} ${sign_argument:+$sign_argument} ${no_big_params_argument:+$no_big_params_argument})
if [[ "${FAST_TESTS}" == "TRUE" ]]; then
echo "Running 'fast' test set"

scripts/pull_lfs_data.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/usr/bin/env bash
set -e
if [ $# -lt 1 ]; then
echo "invalid arguments, usage:\n"
echo "$0 <data_path>"
exit 1
fi
if ! git lfs env 2>/dev/null >/dev/null; then
echo "git lfs is not installed, please install it and try again"
exit 1
fi
git lfs pull --include="$1/*" --exclude=""
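As a usage sketch (the Makefile wiring is not shown in this diff): `scripts/pull_lfs_data.sh utils/tfhe-backward-compat-data` would fetch only the LFS objects under that path, which lets targets such as `make pull_hpu_files` restrict the download to the files they actually need.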

View File

@@ -68,6 +68,12 @@ parser.add_argument(
action="store_true",
help="Do not run tests with big parameters set (e.g. 4bits message with 4 bits carry)",
)
parser.add_argument(
"--no-big-params-gpu",
dest="no_big_params_gpu",
action="store_true",
help="Do not run tests with big parameters set (e.g. 3bits message with 3 bits carry) for GPU",
)
# block PBS are too slow for high params
# mul_crt_4_4 is extremely flaky (~80% failure)
@@ -101,6 +107,10 @@ EXCLUDED_BIG_PARAMETERS = [
"/.*_param_message_4_carry_4_ks_pbs_gaussian_2m64$/",
]
EXCLUDED_BIG_PARAMETERS_GPU = [
"/.*_message_3_carry_3.*$/",
"/.*_group_3_message_2_carry_2.*$/",
]
def filter_integer_tests(input_args):
(multi_bit_filter, group_filter) = (
@@ -130,6 +140,10 @@ def filter_integer_tests(input_args):
for pattern in EXCLUDED_BIG_PARAMETERS:
filter_expression.append(f"not test({pattern})")
if input_args.no_big_params_gpu:
for pattern in EXCLUDED_BIG_PARAMETERS_GPU:
filter_expression.append(f"not test({pattern})")
if input_args.fast_tests and input_args.nightly_tests:
filter_expression.append(
f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_[2-3]_carry_[2-3]_.*/)"

View File

@@ -3,16 +3,11 @@ use std::collections::HashSet;
use std::io::{Error, ErrorKind};
// TODO use .gitignore or git to resolve ignored files
const DIR_TO_IGNORE: [&str; 3] = [
".git",
"target",
// If the data repo has been cloned, we ignore its README
"tests/tfhe-backward-compat-data",
];
const DIR_TO_IGNORE: [&str; 2] = [".git", "target"];
const FILES_TO_IGNORE: [&str; 8] = [
const FILES_TO_IGNORE: [&str; 9] = [
// This contains fragments of code that are unrelated to TFHE-rs
"tfhe/docs/tutorials/sha256_bool.md",
"tfhe/docs/tutorials/sha256-bool.md",
// TODO: This contains code that could be executed as a trivium docstring
"apps/trivium/README.md",
// TODO: should we test this ?
@@ -24,6 +19,7 @@ const FILES_TO_IGNORE: [&str; 8] = [
"utils/tfhe-lints/README.md",
"CONTRIBUTING.md",
"backends/tfhe-hpu-backend/README.md",
"utils/tfhe-backward-compat-data/README.md",
];
pub fn check_tfhe_docs_are_tested() -> Result<(), Error> {
@@ -123,7 +119,17 @@ pub fn check_tfhe_docs_are_tested() -> Result<(), Error> {
}
for value_to_remove in FILES_TO_IGNORE {
let path_to_remove = curr_dir.join(value_to_remove).canonicalize()?.to_path_buf();
let file_to_ignore = curr_dir.join(value_to_remove);
if !file_to_ignore.exists() {
return Err(Error::new(
ErrorKind::InvalidData,
format!(
"Encountered errors while ignoring files: {} does not exist",
file_to_ignore.display()
),
));
}
let path_to_remove = file_to_ignore.canonicalize()?.to_path_buf();
doc_files.remove(&path_to_remove);
}

View File

@@ -7,7 +7,7 @@ publish = false
[dev-dependencies]
tfhe = { path = "../tfhe" }
tfhe-versionable = { path = "../utils/tfhe-versionable" }
tfhe-backward-compat-data = { git = "https://github.com/zama-ai/tfhe-backward-compat-data.git", branch = "v0.8", default-features = false, features = [
tfhe-backward-compat-data = { path = "../utils/tfhe-backward-compat-data", default-features = false, features = [
"load",
] }
cargo_toml = "0.22"

View File

@@ -1,5 +1,5 @@
//! Tests breaking change in serialized data by trying to load historical data stored in https://github.com/zama-ai/tfhe-backward-compat-data.
//! For each tfhe-rs module, there is a folder with some serialized messages and a [ron](https://github.com/ron-rs/ron)
//! Tests breaking changes in serialized data by trying to load historical data stored with git LFS
//! inside `utils/tfhe-backward-compat-data`. For each tfhe-rs module, there is a folder with some serialized messages and a [ron](https://github.com/ron-rs/ron)
//! file. The ron file stores some metadata that are parsed in this test. These metadata tell us
//! what to test for each message.
@@ -22,11 +22,13 @@ fn test_data_dir() -> PathBuf {
} else {
PathBuf::from_str(env!("CARGO_MANIFEST_DIR"))
.unwrap()
.join("..")
.join("utils")
.join("tfhe-backward-compat-data")
};
if !root_dir.exists() {
panic!("Missing backward compatibility test data. Clone them using `make clone_backward_compat_data`")
panic!("Wrong backward compat data folder: {}", root_dir.display())
}
data_dir(root_dir)
@@ -103,6 +105,13 @@ fn run_all_tests<M: TestedModule>(base_dir: &Path) -> Vec<TestResult> {
// If we ran 0 test, it is likely that something wrong happened
assert!(!results.is_empty());
if results.iter().all(|res| res.is_failure()) {
println!(
"\nAll tests failed. Maybe the backward compatibility data files are missing. \
Pull them using `make pull_backward_compat_data`"
)
}
results
}

View File

@@ -84,6 +84,12 @@ path = "benches/high_level_api/dex.rs"
harness = false
required-features = ["integer", "internal-keycache"]
[[bench]]
name = "hlapi-noise-squash"
path = "benches/high_level_api/noise_squash.rs"
harness = false
required-features = ["integer", "internal-keycache"]
[[bench]]
name = "glwe_packing_compression-integer-bench"
path = "benches/integer/glwe_packing_compression.rs"

View File

@@ -640,7 +640,6 @@ mod cuda {
&cuda_indexes.d_lut,
&cuda_indexes.d_output,
&cuda_indexes.d_input,
LweCiphertextCount(1),
gpu_keys.bsk.as_ref().unwrap(),
&streams,
);
@@ -793,7 +792,6 @@ mod cuda {
&cuda_indexes_vec[i].d_lut,
&cuda_indexes_vec[i].d_output,
&cuda_indexes_vec[i].d_input,
LweCiphertextCount(1),
gpu_keys_vec[i].bsk.as_ref().unwrap(),
local_stream,
);

View File

@@ -165,7 +165,7 @@ fn pbs_128(c: &mut Criterion) {
mod cuda {
use benchmark::utilities::{
cuda_local_keys_core, cuda_local_streams_core, get_bench_type, throughput_num_threads,
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord,
write_to_json, BenchmarkType, CpuKeys, CpuKeysBuilder, CryptoParametersRecord, CudaIndexes,
CudaLocalKeys, OperatorType,
};
use criterion::{black_box, Criterion, Throughput};
@@ -173,13 +173,15 @@ mod cuda {
use tfhe::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use tfhe::core_crypto::gpu::{
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext,
cuda_programmable_bootstrap_128_lwe_ciphertext, get_number_of_gpus, CudaStreams,
};
use tfhe::core_crypto::prelude::*;
use tfhe::shortint::engine::ShortintEngine;
use tfhe::shortint::parameters::{
ModulusSwitchType, NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
ModulusSwitchType, NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;
@@ -191,8 +193,8 @@ mod cuda {
.measurement_time(std::time::Duration::from_secs(30));
type Scalar = u128;
let input_params = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let squash_params = NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let input_params = PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let squash_params = NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let lwe_noise_distribution_u64 = DynamicDistribution::new_t_uniform(46);
let ct_modulus_u64: CiphertextModulus<u64> = CiphertextModulus::new_native();
@@ -302,7 +304,6 @@ mod cuda {
&lwe_ciphertext_in_gpu,
&mut out_pbs_ct_gpu,
&accumulator_gpu,
LweCiphertextCount(1),
gpu_keys.bsk.as_ref().unwrap(),
&streams,
);
@@ -398,13 +399,277 @@ mod cuda {
.zip(accumulators.par_iter())
.zip(local_streams.par_iter())
.for_each(
|((((i, input_ct), output_ct), accumulator), local_stream)| {
|(
(((i, input_batch), output_batch), accumulator),
local_stream,
)| {
cuda_programmable_bootstrap_128_lwe_ciphertext(
input_batch,
output_batch,
accumulator,
gpu_keys_vec[i].bsk.as_ref().unwrap(),
local_stream,
);
},
)
},
criterion::BatchSize::SmallInput,
);
});
}
};
let params_record = CryptoParametersRecord {
lwe_dimension: Some(input_params.lwe_dimension),
glwe_dimension: Some(squash_params.glwe_dimension),
polynomial_size: Some(squash_params.polynomial_size),
lwe_noise_distribution: Some(lwe_noise_distribution_u64),
glwe_noise_distribution: Some(input_params.glwe_noise_distribution),
pbs_base_log: Some(squash_params.decomp_base_log),
pbs_level: Some(squash_params.decomp_level_count),
ciphertext_modulus: Some(input_params.ciphertext_modulus),
..Default::default()
};
let bit_size = (message_modulus as u32).ilog2();
write_to_json(
&bench_id,
params_record,
params_name,
"pbs",
&OperatorType::Atomic,
bit_size,
vec![bit_size],
);
}
fn cuda_multi_bit_pbs_128(c: &mut Criterion) {
let bench_name = "core_crypto::cuda::multi_bit_pbs128";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
type Scalar = u128;
let input_params = PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let squash_params =
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let lwe_noise_distribution_u64 = DynamicDistribution::new_t_uniform(46);
let ct_modulus_u64: CiphertextModulus<u64> = CiphertextModulus::new_native();
let params_name = "PARAMS_SWITCH_SQUASH";
let mut boxed_seeder = new_seeder();
let seeder = boxed_seeder.as_mut();
let mut secret_generator =
SecretRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed());
let mut encryption_generator =
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);
let input_lwe_secret_key =
LweSecretKey::generate_new_binary(input_params.lwe_dimension, &mut secret_generator);
let output_glwe_secret_key = GlweSecretKey::<Vec<Scalar>>::generate_new_binary(
squash_params.glwe_dimension,
squash_params.polynomial_size,
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.clone().into_lwe_secret_key();
let multi_bit_bsk = LweMultiBitBootstrapKey::new(
Scalar::ZERO,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.decomp_base_log,
squash_params.decomp_level_count,
input_params.lwe_dimension,
squash_params.grouping_factor,
squash_params.ciphertext_modulus,
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
.multi_bit_bootstrap_key(multi_bit_bsk)
.build();
let message_modulus: u64 = 1 << 4;
let input_message: u64 = 3;
let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus;
let plaintext = Plaintext(input_message * delta);
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
let gpu_keys = CudaLocalKeys::from_cpu_keys(&cpu_keys, None, &streams);
let lwe_ciphertext_in: LweCiphertextOwned<u64> =
allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution_u64,
ct_modulus_u64,
&mut encryption_generator,
);
let lwe_ciphertext_in_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &streams);
let accumulator: GlweCiphertextOwned<Scalar> = GlweCiphertextOwned::new(
Scalar::ONE,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &streams);
let out_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
squash_params.ciphertext_modulus,
);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams);
let h_indexes = [0];
let cuda_indexes = CudaIndexes::new(&h_indexes, &streams, 0);
bench_id = format!("{bench_name}::{params_name}");
{
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
&lwe_ciphertext_in_gpu,
&mut out_pbs_ct_gpu,
&accumulator_gpu,
&cuda_indexes.d_lut,
&cuda_indexes.d_output,
&cuda_indexes.d_input,
gpu_keys.multi_bit_bsk.as_ref().unwrap(),
&streams,
);
black_box(&mut out_pbs_ct_gpu);
})
});
}
}
BenchmarkType::Throughput => {
let gpu_keys_vec = cuda_local_keys_core(&cpu_keys, None);
let gpu_count = get_number_of_gpus() as usize;
bench_id = format!("{bench_name}::throughput::{params_name}");
let blocks: usize = 1;
let elements = throughput_num_threads(blocks, 1);
let elements_per_stream = elements as usize / gpu_count;
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let local_streams = cuda_local_streams_core();
let plaintext_list =
PlaintextList::new(u64::ZERO, PlaintextCount(elements_per_stream));
let input_cts = (0..gpu_count)
.map(|i| {
let mut input_ct_list = LweCiphertextList::new(
u64::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
ct_modulus_u64,
);
encrypt_lwe_ciphertext_list(
&input_lwe_secret_key,
&mut input_ct_list,
&plaintext_list,
lwe_noise_distribution_u64,
&mut encryption_generator,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&input_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let accumulators = (0..gpu_count)
.map(|i| {
let accumulator = GlweCiphertextOwned::new(
Scalar::ONE,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
CudaGlweCiphertextList::from_glwe_ciphertext(
&accumulator,
&local_streams[i],
)
})
.collect::<Vec<_>>();
// Allocate the LweCiphertext to store the result of the PBS
let output_cts = (0..gpu_count)
.map(|i| {
let output_ct_list = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
squash_params.ciphertext_modulus,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ct_list,
&local_streams[i],
)
})
.collect::<Vec<_>>();
let h_indexes = (0..(elements / gpu_count as u64))
.map(CastFrom::cast_from)
.collect::<Vec<_>>();
let cuda_indexes_vec = (0..gpu_count)
.map(|i| CudaIndexes::new(&h_indexes, &local_streams[i], 0))
.collect::<Vec<_>>();
local_streams.iter().for_each(|stream| stream.synchronize());
(
input_cts,
output_cts,
accumulators,
cuda_indexes_vec,
local_streams,
)
};
b.iter_batched(
setup_encrypted_values,
|(
input_cts,
mut output_cts,
accumulators,
cuda_indexes_vec,
local_streams,
)| {
(0..gpu_count)
.into_par_iter()
.zip(input_cts.par_iter())
.zip(output_cts.par_iter_mut())
.zip(accumulators.par_iter())
.zip(local_streams.par_iter())
.for_each(
|((((i, input_ct), output_ct), accumulator), local_stream)| {
cuda_multi_bit_programmable_bootstrap_128_lwe_ciphertext(
input_ct,
output_ct,
accumulator,
LweCiphertextCount(1),
gpu_keys_vec[i].bsk.as_ref().unwrap(),
&cuda_indexes_vec[i].d_lut,
&cuda_indexes_vec[i].d_output,
&cuda_indexes_vec[i].d_input,
gpu_keys_vec[i].multi_bit_bsk.as_ref().unwrap(),
local_stream,
);
},
@@ -444,10 +709,15 @@ mod cuda {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
cuda_pbs_128(&mut criterion);
}
pub fn cuda_multi_bit_pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
cuda_multi_bit_pbs_128(&mut criterion);
}
}
#[cfg(feature = "gpu")]
use cuda::cuda_pbs128_group;
use cuda::{cuda_multi_bit_pbs128_group, cuda_pbs128_group};
pub fn pbs128_group() {
let mut criterion: Criterion<_> = Criterion::default().configure_from_args();
@@ -457,6 +727,7 @@ pub fn pbs128_group() {
#[cfg(feature = "gpu")]
fn go_through_gpu_bench_groups() {
cuda_pbs128_group();
cuda_multi_bit_pbs128_group();
}
#[cfg(not(feature = "gpu"))]

View File

@@ -1031,7 +1031,6 @@ mod cuda {
&cuda_indexes.d_lut,
&cuda_indexes.d_output,
&cuda_indexes.d_input,
LweCiphertextCount(1),
gpu_keys.bsk.as_ref().unwrap(),
&streams,
);
@@ -1113,7 +1112,7 @@ mod cuda {
})
.collect::<Vec<_>>();
let h_indexes = (0..(elements / gpu_count as u64))
let h_indexes = (0..elements_per_stream as u64)
.map(CastFrom::cast_from)
.collect::<Vec<_>>();
let cuda_indexes_vec = (0..gpu_count)
@@ -1157,7 +1156,6 @@ mod cuda {
&cuda_indexes_vec[i].d_lut,
&cuda_indexes_vec[i].d_output,
&cuda_indexes_vec[i].d_input,
LweCiphertextCount(1),
gpu_keys_vec[i].bsk.as_ref().unwrap(),
local_stream,
);

View File

@@ -71,7 +71,7 @@ where
/// This one also uses a comparison, but it leverages the 'boolean' multiplication
/// instead of cmuxes, so it is faster
#[cfg(not(feature = "hpu"))]
#[cfg(all(feature = "gpu", not(feature = "hpu")))]
fn transfer_no_cmux<FheType>(
from_amount: &FheType,
to_amount: &FheType,
@@ -87,6 +87,29 @@ where
let amount = amount * FheType::cast_from(has_enough_funds);
let new_to_amount = to_amount + &amount;
let new_from_amount = from_amount - &amount;
(new_from_amount, new_to_amount)
}
/// Parallel variant of [`transfer_no_cmux`].
#[cfg(not(feature = "hpu"))]
fn par_transfer_no_cmux<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Add<Output = FheType> + CastFrom<FheBool> + for<'a> FheOrd<&'a FheType> + Send + Sync,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType:
Add<Output = FheType> + Sub<Output = FheType> + Mul<FheType, Output = FheType>,
{
let has_enough_funds = (from_amount).ge(amount);
let amount = amount * FheType::cast_from(has_enough_funds);
let (new_to_amount, new_from_amount) =
rayon::join(|| to_amount + &amount, || from_amount - &amount);
@@ -95,12 +118,36 @@ where
/// This one uses overflowing sub to remove the need for comparison
/// it also uses the 'boolean' multiplication
#[cfg(not(feature = "hpu"))]
#[cfg(all(feature = "gpu", not(feature = "hpu")))]
fn transfer_overflow<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: CastFrom<FheBool> + for<'a> FheOrd<&'a FheType> + Send + Sync,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<FheType, Output = FheType>
+ OverflowingSub<&'a FheType, Output = FheType>
+ Mul<FheType, Output = FheType>,
{
let (new_from, did_not_have_enough) = (from_amount).overflowing_sub(amount);
let new_from_amount = did_not_have_enough.if_then_else(from_amount, &new_from);
let had_enough_funds = !did_not_have_enough;
let new_to_amount = to_amount + (amount * FheType::cast_from(had_enough_funds));
(new_from_amount, new_to_amount)
}
/// Parallel variant of [`transfer_overflow`].
#[cfg(not(feature = "hpu"))]
fn par_transfer_overflow<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: CastFrom<FheBool> + for<'a> FheOrd<&'a FheType> + Send + Sync,
FheBool: IfThenElse<FheType>,
@@ -122,12 +169,36 @@ where
/// This one uses both overflowing_add/sub to check that both
/// the sender has enough funds, and the receiver will not overflow its balance
#[cfg(not(feature = "hpu"))]
#[cfg(all(feature = "gpu", not(feature = "hpu")))]
fn transfer_safe<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Send + Sync,
for<'a> &'a FheType: OverflowingSub<&'a FheType, Output = FheType>
+ OverflowingAdd<&'a FheType, Output = FheType>,
FheBool: IfThenElse<FheType>,
{
let (new_from, did_not_have_enough_funds) = (from_amount).overflowing_sub(amount);
let (new_to, did_not_have_enough_space) = (to_amount).overflowing_add(amount);
let something_not_ok = did_not_have_enough_funds | did_not_have_enough_space;
let new_from_amount = something_not_ok.if_then_else(from_amount, &new_from);
let new_to_amount = something_not_ok.if_then_else(to_amount, &new_to);
(new_from_amount, new_to_amount)
}
/// Parallel variant of [`transfer_safe`].
#[cfg(not(feature = "hpu"))]
fn par_transfer_safe<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Send + Sync,
for<'a> &'a FheType: OverflowingSub<&'a FheType, Output = FheType>
@@ -358,71 +429,69 @@ fn cuda_bench_transfer_throughput<FheType, F>(
.map(|i| compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32)))
.collect::<Vec<_>>();
for num_elems in [10 * num_gpus, 100 * num_gpus, 500 * num_gpus] {
group.throughput(Throughput::Elements(num_elems));
let bench_id =
format!("{bench_name}::throughput::{fn_name}::{type_name}::{num_elems}_elems");
group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
let from_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let to_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
// 200 * num_gpus seems to be enough for maximum throughput on 8xH100 SXM5
let num_elems = 200 * num_gpus;
let num_streams_per_gpu = 8; // Hard coded stream value for FheUint64
let chunk_size = (num_elems / num_gpus) as usize;
group.throughput(Throughput::Elements(num_elems));
let bench_id = format!("{bench_name}::throughput::{fn_name}::{type_name}::{num_elems}_elems");
group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| {
let from_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let to_amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
let amounts = (0..num_elems)
.map(|_| FheType::encrypt(rng.gen::<u64>(), client_key))
.collect::<Vec<_>>();
b.iter(|| {
from_amounts
.par_chunks(chunk_size) // Split into chunks of num_gpus
.zip(
to_amounts
.par_chunks(chunk_size)
.zip(amounts.par_chunks(chunk_size)),
) // Zip with the other data
.enumerate() // Get the index for GPU
.for_each(
|(i, (from_amount_gpu_i, (to_amount_gpu_i, amount_gpu_i)))| {
// Process chunks within each GPU
let stream_chunk_size = from_amount_gpu_i.len() / num_streams_per_gpu;
from_amount_gpu_i
.par_chunks(stream_chunk_size)
.zip(to_amount_gpu_i.par_chunks(stream_chunk_size))
.zip(amount_gpu_i.par_chunks(stream_chunk_size))
.for_each(
|((from_amount_chunk, to_amount_chunk), amount_chunk)| {
// Set the server key for the current GPU
set_server_key(sks_vec[i].clone());
// Parallel iteration over the chunks of data
from_amount_chunk
.iter()
.zip(to_amount_chunk.iter().zip(amount_chunk.iter()))
.for_each(|(from_amount, (to_amount, amount))| {
transfer_func(from_amount, to_amount, amount);
});
},
);
},
);
});
let num_streams_per_gpu = 8; // Hard coded stream value for FheUint64
let chunk_size = (num_elems / num_gpus) as usize;
b.iter(|| {
from_amounts
.par_chunks(chunk_size) // Split into chunks of num_gpus
.zip(
to_amounts
.par_chunks(chunk_size)
.zip(amounts.par_chunks(chunk_size)),
) // Zip with the other data
.enumerate() // Get the index for GPU
.for_each(
|(i, (from_amount_gpu_i, (to_amount_gpu_i, amount_gpu_i)))| {
// Process chunks within each GPU
let stream_chunk_size = from_amount_gpu_i.len() / num_streams_per_gpu;
from_amount_gpu_i
.par_chunks(stream_chunk_size)
.zip(to_amount_gpu_i.par_chunks(stream_chunk_size))
.zip(amount_gpu_i.par_chunks(stream_chunk_size))
.for_each(|((from_amount_chunk, to_amount_chunk), amount_chunk)| {
// Set the server key for the current GPU
set_server_key(sks_vec[i].clone());
// Parallel iteration over the chunks of data
from_amount_chunk
.iter()
.zip(to_amount_chunk.iter().zip(amount_chunk.iter()))
.for_each(|(from_amount, (to_amount, amount))| {
transfer_func(from_amount, to_amount, amount);
});
});
},
);
});
});
let params = client_key.computation_parameters();
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"erc20-transfer",
&OperatorType::Atomic,
64,
vec![],
);
}
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"erc20-transfer",
&OperatorType::Atomic,
64,
vec![],
);
}
#[cfg(feature = "hpu")]
@@ -517,14 +586,19 @@ fn main() {
"transfer::whitepaper",
par_transfer_whitepaper::<FheUint64>,
);
print_transfer_pbs_counts(
&cks,
"FheUint64",
"no_cmux",
par_transfer_no_cmux::<FheUint64>,
);
print_transfer_pbs_counts(
&cks,
"FheUint64",
"transfer::overflow",
par_transfer_overflow::<FheUint64>,
);
print_transfer_pbs_counts(&cks, "FheUint64", "safe", par_transfer_safe::<FheUint64>);
}
// FheUint64 latency
@@ -544,7 +618,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::no_cmux",
par_transfer_no_cmux::<FheUint64>,
);
bench_transfer_latency(
&mut group,
@@ -552,7 +626,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::overflow",
par_transfer_overflow::<FheUint64>,
);
bench_transfer_latency(
&mut group,
@@ -560,7 +634,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::safe",
par_transfer_safe::<FheUint64>,
);
group.finish();
@@ -583,7 +657,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::no_cmux",
par_transfer_no_cmux::<FheUint64>,
);
bench_transfer_throughput(
&mut group,
@@ -591,7 +665,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::overflow",
par_transfer_overflow::<FheUint64>,
);
bench_transfer_throughput(
&mut group,
@@ -599,7 +673,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::safe",
par_transfer_safe::<FheUint64>,
);
group.finish();
@@ -631,14 +705,19 @@ fn main() {
"transfer::whitepaper",
par_transfer_whitepaper::<FheUint64>,
);
print_transfer_pbs_counts(
&cks,
"FheUint64",
"no_cmux",
par_transfer_no_cmux::<FheUint64>,
);
print_transfer_pbs_counts(
&cks,
"FheUint64",
"transfer::overflow",
par_transfer_overflow::<FheUint64>,
);
print_transfer_pbs_counts(&cks, "FheUint64", "safe", par_transfer_safe::<FheUint64>);
}
// FheUint64 latency
@@ -658,7 +737,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::no_cmux",
par_transfer_no_cmux::<FheUint64>,
);
bench_transfer_latency(
&mut group,
@@ -666,7 +745,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::overflow",
par_transfer_overflow::<FheUint64>,
);
bench_transfer_latency(
&mut group,
@@ -674,7 +753,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::safe",
par_transfer_safe::<FheUint64>,
);
group.finish();
@@ -689,7 +768,7 @@ fn main() {
bench_name,
"FheUint64",
"transfer::whitepaper",
transfer_whitepaper::<FheUint64>,
);
cuda_bench_transfer_throughput(
&mut group,

View File

@@ -0,0 +1,216 @@
#[cfg(feature = "gpu")]
use benchmark::params_aliases::BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(not(feature = "gpu"))]
use benchmark::params_aliases::BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "gpu")]
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "gpu")]
use benchmark::utilities::configure_gpu;
use benchmark::utilities::{
get_bench_type, throughput_num_threads, write_to_json, BenchmarkType, OperatorType,
};
use criterion::{Criterion, Throughput};
use rand::prelude::*;
use rand::thread_rng;
use rayon::prelude::*;
use tfhe::keycache::NamedParam;
use tfhe::prelude::*;
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::get_number_of_gpus;
#[cfg(feature = "gpu")]
use tfhe::{set_server_key, GpuIndex};
use tfhe::{
ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
};
fn bench_fhe_type<FheType>(
c: &mut Criterion,
client_key: &ClientKey,
type_name: &str,
num_bits: usize,
) where
FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
FheType: SquashNoise,
{
let mut bench_group = c.benchmark_group(type_name);
let bench_id_prefix = if cfg!(feature = "gpu") {
"hlapi::cuda"
} else {
"hlapi"
};
let bench_id_suffix = format!("noise_squash::{type_name}");
let mut rng = thread_rng();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
#[cfg(feature = "gpu")]
configure_gpu(client_key);
let input = FheType::encrypt(rng.gen(), client_key);
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let _ = input.squash_noise();
})
});
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
let params = client_key.computation_parameters();
let num_blocks = num_bits
.div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
#[cfg(feature = "gpu")]
{
let elements = throughput_num_threads(num_blocks, 4);
bench_group.throughput(Throughput::Elements(elements));
println!("elements: {elements}");
let gpu_count = get_number_of_gpus() as usize;
let compressed_server_key = CompressedServerKey::new(client_key);
let sks_vec = (0..gpu_count)
.map(|i| {
compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32))
})
.collect::<Vec<_>>();
bench_group.bench_function(&bench_id, |b| {
let encrypt_values = || {
(0..elements)
.map(|_| FheType::encrypt(rng.gen(), client_key))
.collect::<Vec<_>>()
};
b.iter_batched(
encrypt_values,
|inputs| {
inputs.par_iter().enumerate().for_each(|(i, input)| {
set_server_key(sks_vec[i % gpu_count].clone());
let _ = input.squash_noise();
})
},
criterion::BatchSize::SmallInput,
)
});
}
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
{
let elements = throughput_num_threads(num_blocks, 1);
bench_group.throughput(Throughput::Elements(elements));
println!("elements: {elements}");
bench_group.bench_function(&bench_id, |b| {
let encrypt_values = || {
(0..elements)
.map(|_| FheType::encrypt(rng.gen(), client_key))
.collect::<Vec<_>>()
};
b.iter_batched(
encrypt_values,
|inputs| {
inputs.par_iter().for_each(|input| {
let _ = input.squash_noise();
})
},
criterion::BatchSize::SmallInput,
)
});
}
}
}
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"noise_squash",
&OperatorType::Atomic,
64,
vec![],
);
}
macro_rules! bench_type {
($fhe_type:ident) => {
::paste::paste! {
fn [<bench_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
}
}
};
}
bench_type!(FheUint2);
bench_type!(FheUint4);
bench_type!(FheUint6);
bench_type!(FheUint8);
bench_type!(FheUint10);
bench_type!(FheUint12);
bench_type!(FheUint14);
bench_type!(FheUint16);
bench_type!(FheUint32);
bench_type!(FheUint64);
bench_type!(FheUint128);
fn main() {
#[cfg(feature = "hpu")]
panic!("Noise squashing is not supported on HPU");
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
let cks = {
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use tfhe::{set_server_key, ConfigBuilder};
let config = ConfigBuilder::with_custom_parameters(
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing(BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
let decompressed_sks = compressed_sks.decompress();
rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
set_server_key(decompressed_sks);
cks
};
#[cfg(feature = "gpu")]
let cks = {
use tfhe::{set_server_key, ConfigBuilder};
let config = ConfigBuilder::with_custom_parameters(
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing(
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
set_server_key(compressed_sks.decompress_to_gpu());
cks
};
let mut c = Criterion::default().configure_from_args();
bench_fhe_uint2(&mut c, &cks);
bench_fhe_uint4(&mut c, &cks);
bench_fhe_uint6(&mut c, &cks);
bench_fhe_uint8(&mut c, &cks);
bench_fhe_uint10(&mut c, &cks);
bench_fhe_uint12(&mut c, &cks);
bench_fhe_uint14(&mut c, &cks);
bench_fhe_uint16(&mut c, &cks);
bench_fhe_uint32(&mut c, &cks);
bench_fhe_uint64(&mut c, &cks);
bench_fhe_uint128(&mut c, &cks);
c.final_summary();
}
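For reference, a minimal latency-style usage sketch of the operation this new benchmark file measures, on CPU and reusing the same parameter aliases; `squash_noise()` is the call being timed, error handling is elided, and decryption of the squashed result is omitted:

// Sketch only: single-threaded CPU setup mirroring the benchmark's key
// generation; the GPU path would decompress the key to the device instead.
use benchmark::params_aliases::{
    BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use tfhe::prelude::*;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint64};

fn main() {
    let config = ConfigBuilder::with_custom_parameters(
        BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
    )
    .enable_noise_squashing(BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
    .build();
    let cks = ClientKey::generate(config);
    set_server_key(CompressedServerKey::new(&cks).decompress());
    let ct = FheUint64::encrypt(42u64, &cks);
    // This is the call the latency benchmark times.
    let _squashed = ct.squash_noise();
}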

View File

@@ -159,6 +159,7 @@ fn cpu_glwe_packing(c: &mut Criterion) {
mod cuda {
use super::*;
use benchmark::utilities::cuda_integer_utils::cuda_local_streams;
use itertools::Itertools;
use std::cmp::max;
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
@@ -203,18 +204,20 @@ mod cuda {
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
let (compressed_compression_key, compressed_decompression_key) =
radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
match get_bench_type() {
BenchmarkType::Latency => {
let cuda_compression_key =
compressed_compression_key.decompress_to_cuda(&stream);
let cuda_decompression_key = compressed_decompression_key.decompress_to_cuda(
radix_cks.parameters().glwe_dimension(),
radix_cks.parameters().polynomial_size(),
radix_cks.parameters().message_modulus(),
radix_cks.parameters().carry_modulus(),
radix_cks.parameters().ciphertext_modulus(),
&stream,
);
// Encrypt
let ct = cks.encrypt_radix(0_u32, num_blocks);
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
@@ -268,59 +271,84 @@ mod cuda {
bench_group.throughput(Throughput::Elements(elements));
let local_streams = cuda_local_streams(num_block, elements as usize);
let cuda_compression_key_vec = local_streams
.iter()
.map(|local_stream| {
compressed_compression_key.decompress_to_cuda(local_stream)
})
.collect_vec();
let cuda_decompression_key_vec = local_streams
.iter()
.map(|local_stream| {
compressed_decompression_key.decompress_to_cuda(
radix_cks.parameters().glwe_dimension(),
radix_cks.parameters().polynomial_size(),
radix_cks.parameters().message_modulus(),
radix_cks.parameters().carry_modulus(),
radix_cks.parameters().ciphertext_modulus(),
local_stream,
)
})
.collect_vec();
// Benchmark
let builders = (0..elements)
.map(|i| {
let ct = cks.encrypt_radix(0_u32, num_blocks);
let local_stream = &local_streams[i as usize % local_streams.len()];
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&ct,
local_stream,
);
let mut builder = CudaCompressedCiphertextListBuilder::new();
builder.push(d_ct, local_stream);
builder
})
.collect::<Vec<_>>();
bench_id_pack = format!("{bench_name}::throughput::pack_u{bit_size}");
bench_group.bench_function(&bench_id_pack, |b| {
b.iter(|| {
builders.par_iter().enumerate().for_each(|(i, builder)| {
let local_stream = &local_streams[i % local_streams.len()];
let cuda_compression_key =
&cuda_compression_key_vec[i % local_streams.len()];
builder.build(cuda_compression_key, local_stream);
})
})
});
let compressed = builders
.iter()
.enumerate()
.map(|(i, builder)| {
let local_stream = &local_streams[i % local_streams.len()];
let cuda_compression_key =
&cuda_compression_key_vec[i % local_streams.len()];
builder.build(cuda_compression_key, local_stream)
})
.collect::<Vec<_>>();
bench_id_unpack = format!("{bench_name}::throughput::unpack_u{bit_size}");
bench_group.bench_function(&bench_id_unpack, |b| {
b.iter(|| {
compressed.par_iter().enumerate().for_each(|(i, comp)| {
let local_stream = &local_streams[i % local_streams.len()];
let cuda_decompression_key =
&cuda_decompression_key_vec[i % local_streams.len()];
comp.get::<CudaUnsignedRadixCiphertext>(
0,
cuda_decompression_key,
local_stream,
)
.unwrap()
.unwrap();
})
})
});
}
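The reworked throughput path above decompresses one compression key and one decompression key per local stream, then selects both the stream and the matching key with the same `i % local_streams.len()` index. A small standalone sketch of that round-robin pairing, with placeholder types instead of the CUDA ones:

// Sketch only: `Stream(n)` and `Key(n)` stand in for CudaStreams and the
// decompressed CUDA (de)compression keys living on stream n's GPU.
struct Stream(usize);
struct Key(usize);

fn main() {
    let n = 4usize; // hypothetical number of local streams
    let streams: Vec<Stream> = (0..n).map(Stream).collect();
    let keys: Vec<Key> = (0..n).map(Key).collect(); // one key per stream
    for i in 0..10usize {
        let stream = &streams[i % streams.len()];
        let key = &keys[i % streams.len()];
        // Indexing both vectors with the same value keeps each work item on
        // the GPU where its key was decompressed.
        assert_eq!(stream.0, key.0);
    }
}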

View File

@@ -421,8 +421,6 @@ mod cuda {
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
File::create(results_file).expect("create results file failed");
let mut file = OpenOptions::new()
.append(true)
@@ -439,17 +437,10 @@ mod cuda {
let cks = ClientKey::new(param_fhe);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let sk = compressed_server_key.decompress();
let compact_private_key = CompactPrivateKey::new(param_pke);
let pk = CompactPublicKey::new(&compact_private_key);
let ksk = KeySwitchingKey::new((&compact_private_key, None), (&cks, &sk), param_ksk);
// We have a use case with 320 bits of metadata
let mut metadata = [0u8; (320 / u8::BITS) as usize];
@@ -509,6 +500,18 @@ mod cuda {
match get_bench_type() {
BenchmarkType::Latency => {
let streams = CudaStreams::new_multi_gpu();
let gpu_sks = CudaServerKey::decompress_from_cpu(
&compressed_server_key,
&streams,
);
let d_ksk_material =
CudaKeySwitchingKeyMaterial::from_key_switching_key(&ksk, &streams);
let d_ksk = CudaKeySwitchingKey::from_cuda_key_switching_key_material(
&d_ksk_material,
&gpu_sks,
);
bench_id_verify = format!(
"{bench_name}::{param_name}_{bits}_bits_packed_{zk_load}_ZK{zk_vers:?}"
);
@@ -599,9 +602,12 @@ mod cuda {
});
}
BenchmarkType::Throughput => {
let gpu_count = get_number_of_gpus() as usize;
let mut elements_per_gpu = 100;
if bits == 4096 {
elements_per_gpu /= 5;
}
// This value, found empirically, ensures saturation of an 8xH100 SXM5
let elements = elements_per_gpu * get_number_of_gpus() as u64;
bench_group.throughput(Throughput::Elements(elements));
bench_id_verify = format!(
@@ -636,8 +642,6 @@ mod cuda {
})
.collect::<Vec<_>>();
bench_group.bench_function(&bench_id_verify, |b| {
b.iter(|| {
cts.par_iter().for_each(|ct1| {
@@ -648,23 +652,25 @@ mod cuda {
bench_group.bench_function(&bench_id_expand_without_verify, |b| {
let local_streams = cuda_local_streams(num_block, elements as usize);
let setup_encrypted_values = || {
let gpu_cts = cts.iter().enumerate().map(|(i, ct)| {
let local_stream = &local_streams[i % local_streams.len()];
CudaProvenCompactCiphertextList::from_proven_compact_ciphertext_list(
ct, local_stream,
)
}).collect_vec();
gpu_cts
};
b.iter_batched(setup_encrypted_values,
|gpu_cts| {
gpu_cts.par_iter().enumerate().for_each
(|(i, gpu_ct)| {
let local_stream = &local_streams[i % local_streams.len()];
let gpu_sk = CudaServerKey::decompress_from_cpu(&compressed_server_key, local_stream);
let d_ksk =
CudaKeySwitchingKey::from_cuda_key_switching_key_material(&d_ksk_material_vec[i % local_streams.len()], &gpu_sk);
gpu_ct
.expand_without_verification(&d_ksk, local_stream)
@@ -675,21 +681,24 @@ mod cuda {
bench_group.bench_function(&bench_id_verify_and_expand, |b| {
let local_streams = cuda_local_streams(num_block, elements as usize);
let setup_encrypted_values = || {
let gpu_cts = cts.iter().enumerate().map(|(i, ct)| {
CudaProvenCompactCiphertextList::from_proven_compact_ciphertext_list(
ct, &local_streams[i % local_streams.len()],
)
}).collect_vec();
gpu_cts
};
b.iter_batched(setup_encrypted_values,
|gpu_cts| {
gpu_cts.par_iter().enumerate().for_each
(|(i, gpu_ct)| {
let local_stream = &local_streams[i % local_streams.len()];
let gpu_sk = CudaServerKey::decompress_from_cpu(&compressed_server_key, local_stream);
let d_ksk =
CudaKeySwitchingKey::from_cuda_key_switching_key_material(&d_ksk_material_vec[i % local_streams.len()], &gpu_sk);
gpu_ct
.verify_and_expand(
&crs, &pk, &metadata, &d_ksk, local_stream,
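A standalone sketch of the new element-count logic in this file's throughput branch: a per-GPU budget of 100 proven ciphertexts, cut to 20 for the 4096-bit case, scaled by the detected GPU count. `get_number_of_gpus` is stubbed here with a hypothetical 8-GPU node; the real one comes from `tfhe::core_crypto::gpu`:

// Sketch only: mirrors the arithmetic above with a stubbed GPU count.
fn get_number_of_gpus() -> u64 {
    8 // hypothetical 8xH100 node
}

fn zk_throughput_elements(bits: u64) -> u64 {
    let mut elements_per_gpu = 100u64;
    if bits == 4096 {
        elements_per_gpu /= 5;
    }
    elements_per_gpu * get_number_of_gpus()
}

fn main() {
    assert_eq!(zk_throughput_elements(64), 800);
    assert_eq!(zk_throughput_elements(4096), 160);
}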

View File

@@ -139,6 +139,10 @@ pub mod shortint_params_aliases {
NoiseSquashingParameters =
V1_3_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingParameters =
V1_3_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "hpu")]
// KS PBS Gaussian for Hpu
pub const BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64: KeySwitch32PBSParameters =

View File

@@ -311,9 +311,9 @@ pub fn write_to_json<
}
const FAST_BENCH_BIT_SIZES: [usize; 1] = [64];
const BENCH_BIT_SIZES: [usize; 7] = [8, 16, 32, 40, 64, 128, 256];
const HPU_BENCH_BIT_SIZES: [usize; 5] = [8, 16, 32, 64, 128];
const MULTI_BIT_CPU_SIZES: [usize; 5] = [8, 16, 32, 40, 64];
/// User configuration in which benchmarks must be run.
#[derive(Default)]
@@ -397,8 +397,8 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
let total_num_sm = H100_PCIE_SM_COUNT * get_number_of_gpus();
let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
elements.min(200) // This threshold is useful for operations with both a small number of
// blocks and a low PBS count.
}
#[cfg(feature = "hpu")]
{
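The GPU branch above now clamps the computed element count to 200 instead of 1500. A tiny sketch of the effect of that cap; `raw_elements` stands in for the full SM-based formula, which also uses `block_multiplicator` and `minimum_loading` (not shown in this hunk):

// Sketch only: isolates the new clamp from the rest of the formula.
fn capped(raw_elements: u64) -> u64 {
    raw_elements.min(200)
}

fn main() {
    assert_eq!(capped(5000), 200); // large computed loadings are clamped
    assert_eq!(capped(120), 120); // small ones keep their computed value
}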
@@ -521,7 +521,7 @@ mod cuda_utils {
pub ksk: Option<CudaLweKeyswitchKey<T>>,
pub pksk: Option<CudaLwePackingKeyswitchKey<T>>,
pub bsk: Option<CudaLweBootstrapKey>,
pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey<T>>,
}
#[allow(dead_code)]
@@ -579,10 +579,12 @@ mod cuda_utils {
let mut d_input = unsafe { CudaVec::<T>::new_async(length, stream, stream_index) };
let mut d_output = unsafe { CudaVec::<T>::new_async(length, stream, stream_index) };
let mut d_lut = unsafe { CudaVec::<T>::new_async(length, stream, stream_index) };
let zeros = vec![T::ZERO; length];
unsafe {
d_input.copy_from_cpu_async(indexes.as_ref(), stream, stream_index);
d_output.copy_from_cpu_async(indexes.as_ref(), stream, stream_index);
d_lut.copy_from_cpu_async(zeros.as_ref(), stream, stream_index);
}
stream.synchronize();
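The last hunk seeds the device LUT buffer from a zeroed vector instead of reusing the index array. A plain host-side analogue of that initialization change, with a hypothetical buffer length:

// Sketch only: host vectors stand in for the CudaVec buffers.
fn main() {
    let length = 16usize; // hypothetical buffer length
    let indexes: Vec<u64> = (0..length as u64).collect();
    let zeros = vec![0u64; length];
    // Before the fix, the LUT buffer was filled from `indexes`;
    // now it is filled from `zeros`, like this host-side copy.
    let d_lut: Vec<u64> = zeros.clone();
    assert_eq!(d_lut.len(), indexes.len());
    assert!(d_lut.iter().all(|&x| x == 0));
}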

View File

(Image file changed: the before and after renders are both 15 KiB.)

Some files were not shown because too many files have changed in this diff.