chore(gpu): remove sub streams from overflowing subtraction

2026-04-28 03:01:21 -04:00 · 2025-09-22 09:53:08 +02:00
407 changed files with 10902 additions and 25449 deletions
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -7,8 +7,6 @@ self-hosted-runner:
    - large_ubuntu_16
    - large_ubuntu_16-22.04
    - v80-desktop
-    - v80-marais
-    - v80-couperin
 # Configuration variables in array of strings defined in your repository or
 # organization. `null` means disabling configuration variables check.
 # Empty array means no configuration variable is allowed.
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -66,9 +66,14 @@ jobs:
      - name: Checkout tfhe-rs
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
        with:
-          persist-credentials: 'true' # Needed to pull lfs data
+          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

+      - name: Install latest stable
+        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+
      # Cache key is an aggregated hash of lfs files hashes
      - name: Get LFS data sha
        id: hash-lfs-data
@@ -78,7 +83,7 @@ jobs:

      - name: Retrieve data from cache
        id: retrieve-data-cache
-        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
+        uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
@@ -90,16 +95,6 @@ jobs:
        run: |
          make pull_backward_compat_data

-      # Pull token was stored by action/checkout to be used by lfs, we don't need it anymore
-      - name: Remove git credentials
-        run: |
-          git config --local --unset-all http.https://github.com/.extraheader
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: stable
-
      - name: Run backward compatibility tests
        run: |
          make test_backward_compatibility_ci
@@ -107,7 +102,7 @@ jobs:
      - name: Store data in cache
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
        continue-on-error: true
-        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
+        uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
        with:
          path: |
            utils/tfhe-backward-compat-data/**/*.cbor
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -217,7 +217,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
+        uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
        with:
          path: |
            ~/.nvm
@@ -230,7 +230,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
+        uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -107,7 +107,6 @@ jobs:
      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 480 # 8 hours
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -78,7 +78,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
+        uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
        with:
          path: |
            ~/.nvm
@@ -91,7 +91,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
+        uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -3,16 +3,6 @@ name: benchmark_core_crypto

 on:
  workflow_dispatch:
-    inputs:
-      param_type:
-        description: "Parameters type"
-        type: choice
-        default: classical
-        options:
-          - classical
-          - multi_bit
-          - both
-
  schedule:
    # Weekly benchmarks will be triggered each Saturday at 5a.m.
    - cron: '0 5 * * 6'
@@ -32,38 +22,8 @@ env:
 permissions: {}

 jobs:
-  prepare-matrix:
-    name: benchmark_core_crypto/prepare-matrix
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
-      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
-    outputs:
-      param_type: ${{ steps.set_param_type.outputs.param_type }}
-    steps:
-      - name: Set parameters types
-        if: github.event_name == 'workflow_dispatch'
-        run: |
-          if [[ "${INPUTS_PARAM_TYPE}" == "both" ]]; then
-            echo "PARAM_TYPE=[\"classical\", \"multi_bit\"]" >> "${GITHUB_ENV}"
-          else
-            echo "PARAM_TYPE=[\"${INPUTS_PARAM_TYPE}\"]" >> "${GITHUB_ENV}"
-          fi
-        env:
-          INPUTS_PARAM_TYPE: ${{ inputs.param_type }}
-
-      - name: Default parameters type
-        if: github.event_name != 'workflow_dispatch'
-        run: |
-          echo "PARAM_TYPE=[\"classical\"]" >> "${GITHUB_ENV}"
-
-      - name: Set parameters types output
-        id: set_param_type
-        run: | # zizmor: ignore[template-injection] this env variable is safe
-          echo "param_type=${{ toJSON(env.PARAM_TYPE) }}" >> "${GITHUB_OUTPUT}"
-
  setup-instance:
    name: benchmark_core_crypto/setup-instance
-    needs: prepare-matrix
    runs-on: ubuntu-latest
    if: github.event_name != 'schedule' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
@@ -83,16 +43,11 @@ jobs:

  core-crypto-benchmarks:
    name: benchmark_core_crypto/core-crypto-benchmarks
-    needs: [ prepare-matrix, setup-instance ]
+    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    timeout-minutes: 1440  # 24 hours
-    strategy:
-      max-parallel: 1
-      matrix:
-        param_type: ${{ fromJSON(needs.prepare-matrix.outputs.param_type) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
@@ -123,8 +78,6 @@ jobs:
          make bench_pbs
          make bench_pbs128
          make bench_ks
-        env:
-          BENCH_PARAM_TYPE: ${{ matrix.param_type }}

      - name: Parse results
        run: |
@@ -143,7 +96,7 @@ jobs:
      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
        with:
-          name: ${{ github.sha }}_core_crypto_${{ matrix.param_type }}_pbs
+          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
--- a/.github/workflows/benchmark_hpu_hlapi.yml
+++ b/.github/workflows/benchmark_hpu_hlapi.yml
@@ -16,7 +16,7 @@ permissions: {}
 jobs:
  hlapi-benchmarks-hpu:
    name: Execute HLAPI benchmarks for HPU backend
-    runs-on: v80-marais
+    runs-on: v80-desktop
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -60,14 +60,11 @@ jobs:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

-      - name: Select HPU board
-        run: |
-          echo "V80_PCIE_DEV=24" >> "${GITHUB_ENV}"
-          echo "V80_SERIAL_NUMBER=XFL12NWY3ZKG" >> "${GITHUB_ENV}"
-
      - name: Run benchmarks
        run: |
          make pull_hpu_files
+          export V80_SERIAL_NUMBER=XFL12E4XJXWK
+          source /opt/xilinx/Vivado/2024.2/settings64.sh
          make bench_hlapi_erc20_hpu
          make bench_hlapi_hpu

--- a/.github/workflows/benchmark_hpu_integer.yml
+++ b/.github/workflows/benchmark_hpu_integer.yml
@@ -29,7 +29,7 @@ permissions: {}
 jobs:
  prepare-matrix:
    name: Prepare operations matrix
-    runs-on: v80-marais
+    runs-on: v80-desktop
    outputs:
      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
    steps:
@@ -48,17 +48,17 @@ jobs:
        if: github.event_name != 'workflow_dispatch'
        run: |
          echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
-      
+
+
      - name: Set benchmark types output
        id: set_bench_type
        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"

-
  integer-benchmarks-hpu:
    name: benchmark_hpu_integer/integer-benchmarks-hpu
    needs: prepare-matrix
-    runs-on: v80-marais
+    runs-on: v80-desktop
    concurrency:
      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
@@ -111,15 +111,11 @@ jobs:
        run: |
          echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"

-      - name: Select HPU board
-        run: |
-          echo "V80_PCIE_DEV=24" >> "${GITHUB_ENV}"
-          echo "V80_SERIAL_NUMBER=XFL12NWY3ZKG" >> "${GITHUB_ENV}"
-
      - name: Run benchmarks
        run: |
-          echo "${V80_PCIE_DEV} ${V80_SERIAL_NUMBER}"
          make pull_hpu_files
+          export V80_SERIAL_NUMBER=XFL12E4XJXWK
+          source /opt/xilinx/Vivado/2024.2/settings64.sh
          make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
        env:
          BENCH_TYPE: ${{ matrix.bench_type }}
--- a/.github/workflows/benchmark_perf_regression.yml
+++ b/.github/workflows/benchmark_perf_regression.yml
@@ -44,7 +44,7 @@ jobs:
      custom-env: ${{ steps.get_custom_env.outputs.custom_env }}
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -132,7 +132,7 @@ jobs:
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -159,7 +159,7 @@ jobs:
        command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
    steps:
      - name: Checkout tfhe-rs repo with tags
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          persist-credentials: 'false'
@@ -213,7 +213,7 @@ jobs:
          toolchain: nightly

      - name: Checkout Slab repo
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          repository: zama-ai/slab
          path: slab
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -117,7 +117,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
+        uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
        with:
          path: |
            ~/.nvm
@@ -130,7 +130,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
+        uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
--- a/.github/workflows/cargo_audit.yml
+++ b/.github/workflows/cargo_audit.yml
@@ -1,6 +1,4 @@
 # Run cargo audit
-name: cargo_audit
-
 on:
  workflow_dispatch:
  schedule:
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -18,95 +18,17 @@ permissions:
  contents: read

 jobs:
-  prepare-parallel-pcc-matrix:
-    name: cargo_build/prepare-parallel-pcc-matrix
-    runs-on: ubuntu-latest
-    outputs:
-      matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
-    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
-        with:
-          persist-credentials: "false"
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      # Fetch all the Make recipes that start with `pcc_batch_`
-      - name: Set pcc commands matrix
-        id: set-pcc-commands-matrix
-        run: |
-          COMMANDS=$(grep -oE '^pcc_batch_[^:]*:' Makefile | sed 's/:/\"/; s/^/\"/' | paste -sd,)
-          echo "commands=[${COMMANDS}]" >> "$GITHUB_OUTPUT"
-
-  parallel-pcc-cpu:
-    name: cargo_build/parallel-pcc-cpu
-    needs: prepare-parallel-pcc-matrix
-    runs-on: large_ubuntu_16
-    strategy:
-      matrix:
-        command: ${{fromJson(needs.prepare-parallel-pcc-matrix.outputs.matrix_command)}}
-      fail-fast: false
-    steps:
-      - name: Checkout tfhe-rs repo
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: stable
-
-      - name: Run pcc checks batch
-        run: |
-          make "${COMMAND}"
-        env:
-          COMMAND: ${{ matrix.command }}
-
-  pcc-hpu:
-    name: cargo_build/pcc-hpu
-    runs-on: large_ubuntu_16
-    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: stable
-
-      - name: Run Hpu pcc checks
-        run: |
-          make pcc_hpu
-
-  build-tfhe-full:
-    name: cargo_build/build-tfhe-full
+  cargo-builds:
+    name: cargo_build/cargo-builds (bpr)
    runs-on: ${{ matrix.os }}
+
    strategy:
      matrix:
        # GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
        # even with a few PRs
-        os: [large_ubuntu_16, macos-latest-xlarge, large_windows_16_latest]
+        os: [large_ubuntu_16, macos-latest, windows-latest]
      fail-fast: false
-    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}

-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: stable
-
-      - name: Build Release tfhe full
-        run: |
-          make build_tfhe_full
-
-  build:
-    name: cargo_build/build
-    runs-on: large_ubuntu_16
    steps:
      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
        with:
@@ -119,6 +41,7 @@ jobs:
          toolchain: stable

      - name: Install and run newline linter checks
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
          echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
@@ -127,93 +50,60 @@ jobs:
          mv linelint-linux-amd64 /usr/local/bin/linelint
          make check_newline

+      - name: Run pcc checks
+        if: ${{ contains(matrix.os, 'ubuntu') }}
+        run: |
+          make pcc
+
      - name: Build tfhe-csprng
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_tfhe_csprng

      - name: Build with MSRV
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_tfhe_msrv

-      - name: Build coverage tests
-        run: |
-          make build_tfhe_coverage
-
-  build-layers:
-    name: cargo_build/build-layers
-    runs-on: large_ubuntu_16
-    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: stable
-
      - name: Build Release core
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_core AVX512_SUPPORT=ON
          make build_core_experimental AVX512_SUPPORT=ON

      - name: Build Release boolean
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_boolean

      - name: Build Release shortint
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_shortint

      - name: Build Release integer
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_integer

-  build-c-api:
-    name: cargo_build/build-c-api
-    runs-on: large_ubuntu_16
-    steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
-        with:
-          toolchain: stable
+      - name: Build Release tfhe full
+        run: |
+          make build_tfhe_full

      - name: Build Release c_api
+        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
          make build_c_api

+      - name: Build coverage tests
+        if: ${{ contains(matrix.os, 'ubuntu') }}
+        run: |
+          make build_tfhe_coverage
+
+      - name: Run Hpu pcc checks
+        if: ${{ contains(matrix.os, 'ubuntu') }}
+        run: |
+          make pcc_hpu
+
      # The wasm build check is a bit annoying to set-up here and is done during the tests in
      # aws_tfhe_tests.yml
-
-  cargo-builds:
-    name: cargo_build/cargo-builds (bpr)
-    needs: [ parallel-pcc-cpu, pcc-hpu, build-tfhe-full, build, build-layers, build-c-api ]
-    if: ${{ always() }}
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check all builds success
-        if: needs.parallel-pcc-cpu.result == 'success' &&
-          needs.pcc-hpu.result == 'success' &&
-          needs.build-tfhe-full.result == 'success' &&
-          needs.build.result == 'success' &&
-          needs.build-layers.result == 'success' &&
-          needs.build-c-api.result == 'success'
-        run: |
-          echo "All tfhe-rs build checks passed"
-
-      - name: Check builds failure
-        if: needs.parallel-pcc-cpu.result != 'success' ||
-          needs.pcc-hpu.result != 'success' ||
-          needs.build-tfhe-full.result != 'success' ||
-          needs.build.result != 'success' ||
-          needs.build-layers.result != 'success' ||
-          needs.build-c-api.result != 'success'
-        run: |
-          echo "Some tfhe-rs build checks failed"
-          exit 1
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -42,7 +42,7 @@ jobs:
          GH_TOKEN: ${{ env.CHECKOUT_TOKEN }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@9e9574ef04ea69da568d6249bd69539ccc704e74 # v4.0.0
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@fc87bb5b5a97953d987372e74478de634726b3e5 # v3.0.25
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/coprocessor-benchmark-gpu.yml
+++ b/.github/workflows/coprocessor-benchmark-gpu.yml
@@ -193,7 +193,7 @@ jobs:
        uses: foundry-rs/foundry-toolchain@82dee4ba654bd2146511f85f0d013af94670c4de

      - name: Cache cargo
-        uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
        with:
          path: |
            ~/.cargo/registry
@@ -210,7 +210,7 @@ jobs:
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Login to Chainguard Registry
-        uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
+        uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
        with:
          registry: cgr.dev
          username: ${{ secrets.CGR_USERNAME }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -101,12 +101,6 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe-cuda-backend
-
-      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        with:
-          name: crate-tfhe-cuda-backend
-          path: target/package/*.crate
-
      - name: generate hash
        id: hash
        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
@@ -175,12 +169,6 @@ jobs:
        env:
          GCC_VERSION: ${{ matrix.gcc }}

-      - name: Download artifact
-        uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
-        with:
-          name: crate-tfhe-cuda-backend
-          path: target/package
-
      - name: Authenticate on registry
        uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
        id: auth
--- a/.github/workflows/make_release_hpu.yml
+++ b/.github/workflows/make_release_hpu.yml
@@ -24,13 +24,6 @@ jobs:
    with:
      package-name: "tfhe-hpu-backend"
      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      # Needed to detect the GitHub Actions environment
-      actions: read
-      # Needed to create the provenance via GitHub OIDC
-      id-token: write
-      # Needed to upload assets/artifacts
-      contents: write
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_tfhe.yml
+++ b/.github/workflows/make_release_tfhe.yml
@@ -42,13 +42,6 @@ jobs:
    with:
      package-name: "tfhe"
      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      # Needed to detect the GitHub Actions environment
-      actions: read
-      # Needed to create the provenance via GitHub OIDC
-      id-token: write
-      # Needed to upload assets/artifacts
-      contents: write
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
@@ -85,7 +78,7 @@ jobs:

      - name: Publish web package
        if: ${{ inputs.push_web_package }}
-        uses: JS-DevTools/npm-publish@7f8fe47b3bea1be0c3aec2b717c5ec1f3e03410b
+        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
        with:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
@@ -103,7 +96,7 @@ jobs:

      - name: Publish Node package
        if: ${{ inputs.push_node_package }}
-        uses: JS-DevTools/npm-publish@7f8fe47b3bea1be0c3aec2b717c5ec1f3e03410b
+        uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
        with:
          token: ${{ secrets.NPM_TOKEN }}
          package: tfhe/pkg/package.json
--- a/.github/workflows/make_release_tfhe_csprng.yml
+++ b/.github/workflows/make_release_tfhe_csprng.yml
@@ -17,13 +17,6 @@ jobs:
    with:
      package-name: "tfhe-csprng"
      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      # Needed to detect the GitHub Actions environment
-      actions: read
-      # Needed to create the provenance via GitHub OIDC
-      id-token: write
-      # Needed to upload assets/artifacts
-      contents: write
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_tfhe_fft.yml
+++ b/.github/workflows/make_release_tfhe_fft.yml
@@ -25,13 +25,6 @@ jobs:
    with:
      package-name: "tfhe-fft"
      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      # Needed to detect the GitHub Actions environment
-      actions: read
-      # Needed to create the provenance via GitHub OIDC
-      id-token: write
-      # Needed to upload assets/artifacts
-      contents: write
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_tfhe_ntt.yml
+++ b/.github/workflows/make_release_tfhe_ntt.yml
@@ -25,13 +25,6 @@ jobs:
    with:
      package-name: "tfhe-ntt"
      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      # Needed to detect the GitHub Actions environment
-      actions: read
-      # Needed to create the provenance via GitHub OIDC
-      id-token: write
-      # Needed to upload assets/artifacts
-      contents: write
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_tfhe_versionable.yml
+++ b/.github/workflows/make_release_tfhe_versionable.yml
@@ -24,13 +24,6 @@ jobs:
    with:
      package-name: "tfhe-versionable-derive"
      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      # Needed to detect the GitHub Actions environment
-      actions: read
-      # Needed to create the provenance via GitHub OIDC
-      id-token: write
-      # Needed to upload assets/artifacts
-      contents: write
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
@@ -46,13 +39,6 @@ jobs:
    with:
      package-name: "tfhe-versionable"
      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      # Needed to detect the GitHub Actions environment
-      actions: read
-      # Needed to create the provenance via GitHub OIDC
-      id-token: write
-      # Needed to upload assets/artifacts
-      contents: write
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -24,13 +24,6 @@ jobs:
    with:
      package-name: "tfhe-zk-pok"
      dry-run: ${{ inputs.dry_run }}
-    permissions:
-      # Needed to detect the GitHub Actions environment
-      actions: read
-      # Needed to create the provenance via GitHub OIDC
-      id-token: write
-      # Needed to upload assets/artifacts
-      contents: write
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,7 +24,7 @@ exclude = [
 ]
 [workspace.dependencies]
 aligned-vec = { version = "0.6", default-features = false }
-bytemuck = "<1.24"
+bytemuck = "1.14.3"
 dyn-stack = { version = "0.11", default-features = false }
 itertools = "0.14"
 num-complex = "0.4"
@@ -54,10 +54,3 @@ debug-assertions = false

 [workspace.metadata.dylint]
 libraries = [{ path = "utils/tfhe-lints" }]
-
-[profile.debug_lto_off]
-inherits = "dev"
-debug = true
-lto = "off"
-debug-assertions = false
-overflow-checks = false
--- a/Makefile
+++ b/Makefile
@@ -54,7 +54,6 @@ TFHECUDA_BUILD=$(TFHECUDA_SRC)/build

 # tfhe-hpu-backend
 HPU_CONFIG=v80
-V80_PCIE_DEV?=01

 # Exclude these files from coverage reports
 define COVERAGE_EXCLUDED_FILES
@@ -746,16 +745,6 @@ test_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=integer,gpu -p tfhe -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence integer::gpu::server_key::radix::tests_long_run::test_signed_random_op_sequence --test-threads=1 --nocapture

-.PHONY: build_debug_integer_short_run_gpu # Run the long run integer tests on the gpu backend
-build_debug_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test -vv --no-run --profile debug_lto_off \
-		--features=integer,gpu-debug-fake-multi-gpu -p tfhe
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile debug_lto_off \
-		--features=integer,gpu-debug-fake-multi-gpu -p tfhe -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence::test_gpu_short_random --list
-	@echo "To debug fake-multi-gpu short run tests run:"
-	@echo "TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE <executable> integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence::test_gpu_short_random_op_sequence_param_gpu_multi_bit_group_4_message_2_carry_2_ks_pbs_tuniform_2m128 --nocapture"
-	@echo "Where <executable> = the one printed in the () in the 'Running unittests src/lib.rs ()' line above"
-
 .PHONY: test_integer_compression
 test_integer_compression: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -1020,11 +1009,6 @@ build_one_hl_api_test_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
 		   --features=integer,gpu-debug -vv -p tfhe -- "$${TEST}" --test-threads=1 --nocapture

-.PHONY: build_one_hl_api_test_fake_multi_gpu
-build_one_hl_api_test_fake_multi_gpu: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
-		   --features=integer,gpu-debug-fake-multi-gpu -vv -p tfhe -- "$${TEST}" --test-threads=1 --nocapture
-
 test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
 ifeq ($(HPU_CONFIG), v80)
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
@@ -1205,8 +1189,6 @@ check_compile_tests: install_rs_build_toolchain
 		--features=experimental,boolean,shortint,integer,internal-keycache \
 		-p tfhe

-.PHONY: check_compile_tests_c_api # Build C API tests without running them
-check_compile_tests_c_api: install_rs_build_toolchain
 	@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
 		"$(MAKE)" build_c_api && \
 		./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
@@ -1344,12 +1326,11 @@ bench_signed_integer_gpu: install_rs_check_toolchain

 .PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
 bench_integer_hpu: install_rs_check_toolchain
-	source ./setup_hpu.sh --config $(HPU_CONFIG); \
-	export V80_PCIE_DEV=${V80_PCIE_DEV}; \
+	source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
+	--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
@@ -1522,22 +1503,21 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
 bench_hlapi: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi \
-	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --
+	--features=integer,internal-keycache,nightly-avx512 -p tfhe-benchmark --

 .PHONY: bench_hlapi_gpu # Run benchmarks for integer operations on GPU
 bench_hlapi_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi \
-	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --
+	--features=integer,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark --

 .PHONY: bench_hlapi_hpu # Run benchmarks for HLAPI operations on HPU
 bench_hlapi_hpu: install_rs_check_toolchain
-	source ./setup_hpu.sh --config $(HPU_CONFIG); \
-	export V80_PCIE_DEV=${V80_PCIE_DEV}; \
+	source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi \
-	--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --
+	--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --

 .PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
 bench_hlapi_erc20: install_rs_check_toolchain
@@ -1565,8 +1545,7 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain

 .PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
 bench_hlapi_erc20_hpu: install_rs_check_toolchain
-	source ./setup_hpu.sh --config $(HPU_CONFIG); \
-	export V80_PCIE_DEV=${V80_PCIE_DEV}; \
+	source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
@@ -1683,38 +1662,11 @@ sha256_bool: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example sha256_bool --features=boolean

-.PHONY: pcc # pcc stands for pre commit checks for CPU compilation
-pcc: pcc_batch_1 pcc_batch_2 pcc_batch_3 pcc_batch_4 pcc_batch_5 pcc_batch_6 pcc_batch_7
-
-#
-# PCC split into several batches to speed-up CI feedback.
-# Each batch have roughly the same execution time.
-# Durations are given from GitHub Ubuntu large runner with 16 CPU.
-#
-
-.PHONY: pcc_batch_1 # duration: 6'10''
-pcc_batch_1: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
-check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash test_tfhe_lints tfhe_lints \
-clippy_rustdoc
-
-.PHONY: pcc_batch_2 # duration: 6'10''
-pcc_batch_2: clippy clippy_all_targets
-
-.PHONY: pcc_batch_3 # duration: 6'50''
-pcc_batch_3: clippy_shortint clippy_integer
-
-.PHONY: pcc_batch_4 # duration: 7'40''
-pcc_batch_4: clippy_core clippy_js_wasm_api clippy_ws_tests clippy_bench
-
-.PHONY: pcc_batch_5 # duration: 7'20''
-pcc_batch_5: clippy_tfhe_lints check_compile_tests clippy_backward_compat_data
-
-.PHONY: pcc_batch_6  # duration: 4'50'' (shortest one, extend it with further checks)
-pcc_batch_6: clippy_boolean clippy_c_api clippy_tasks clippy_tfhe_csprng clippy_zk_pok \
-clippy_trivium clippy_versionable clippy_param_dedup
-
-.PHONY: pcc_batch_7 # duration: 7'50'' (currently PCC execution bottleneck)
-pcc_batch_7: check_compile_tests_c_api
+.PHONY: pcc # pcc stands for pre commit checks (except GPU)
+pcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
+check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash \
+clippy_all check_compile_tests test_tfhe_lints \
+tfhe_lints

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ production-ready library for all the advanced features of TFHE.
 - **Short integer API** that enables exact, unbounded FHE integer arithmetics with up to 8 bits of message space
 - **Size-efficient public key encryption**
 - **Ciphertext and server key compression** for efficient data transfer
- **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.
+- **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.

 *Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
 <br></br>
@@ -79,7 +79,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
 ```

 > [!Note]
-> Note: You need Rust version 1.84 or newer to compile TFHE-rs. You can check your version with `rustc --version`.
+> Note: You need to use Rust version >= 1.84 to compile TFHE-rs.

 > [!Note]
 > Note: AArch64-based machines are not supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
@@ -147,7 +147,7 @@ To run this code, use the following command:

 > [!Note]
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
-to run in release mode with cargo's `--release` flag to have the best performance possible.
+to run in release mode with cargo's `--release` flag to have the best performance possible.

 *Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*

--- a/_typos.toml
+++ b/_typos.toml
@@ -13,7 +13,6 @@ extend-ignore-identifiers-re = [
    # Example in trivium
    "C9217BA0D762ACA1",
    "0x[0-9a-fA-F]+",
-    "xrt_coreutil",
 ]

 [files]
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.12.0"
+version = "0.11.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -20,4 +20,3 @@ bindgen = "0.71"
 experimental-multi-arch = []
 profile = []
 debug = []
-debug-fake-multi-gpu = []
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -48,16 +48,13 @@ fn main() {
        // Conditionally pass the "USE_NVTOOLS" variable to CMake if the feature is enabled
        if cfg!(feature = "profile") {
            cmake_config.define("USE_NVTOOLS", "ON");
+            println!("cargo:rustc-link-lib=nvToolsExt");
        } else {
            cmake_config.define("USE_NVTOOLS", "OFF");
        }

        if cfg!(feature = "debug") {
            cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
-        } else if cfg!(feature = "debug-fake-multi-gpu") {
-            cmake_config.define("CMAKE_BUILD_TYPE", "DebugOnlyCpu");
-            cmake_config.define("CMAKE_VERBOSE_MAKEFILE", "ON");
-            cmake_config.define("FAKE_MULTI_GPU", "ON");
        }

        // Build the CMake project
@@ -84,7 +81,6 @@ fn main() {
            "cuda/include/ciphertext.h",
            "cuda/include/integer/compression/compression.h",
            "cuda/include/integer/integer.h",
-            "cuda/include/aes/aes.h",
            "cuda/include/zk/zk.h",
            "cuda/include/keyswitch/keyswitch.h",
            "cuda/include/keyswitch/ks_enums.h",
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -87,9 +87,6 @@ if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
  add_definitions(-DDEBUG)
  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
  set(USE_NVTOOLS 1)
-elseif(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debugonlycpu")
-  message("Compiling GPU kernels in Release and CPU code in Debug")
-  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -g")
 else()
  # Release mode
  message("Compiling in Release mode")
@@ -102,11 +99,6 @@ if(${USE_NVTOOLS})
  add_definitions(-DUSE_NVTOOLS)
 endif()

-if(${FAKE_MULTI_GPU})
-  message(STATUS "Fake multi-gpu debugging is enabled")
-  add_definitions(-DDEBUG_FAKE_MULTI_GPU)
-endif()
-
 # in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging to use
 # nvtx when profiling -lnvToolsExt
 set(CMAKE_CUDA_FLAGS
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes.h
@@ -1,44 +0,0 @@
-#ifndef AES_H
-#define AES_H
-#include "../integer/integer.h"
-
-extern "C" {
-uint64_t scratch_cuda_integer_aes_encrypt_64(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism);
-
-void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
-                                     CudaRadixCiphertextFFI *output,
-                                     CudaRadixCiphertextFFI const *iv,
-                                     CudaRadixCiphertextFFI const *round_keys,
-                                     const uint64_t *counter_bits_le_all_blocks,
-                                     uint32_t num_aes_inputs, int8_t *mem_ptr,
-                                     void *const *bsks, void *const *ksks);
-
-void cleanup_cuda_integer_aes_encrypt_64(CudaStreamsFFI streams,
-                                         int8_t **mem_ptr_void);
-
-uint64_t scratch_cuda_integer_key_expansion_64(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type);
-
-void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
-                                   CudaRadixCiphertextFFI *expanded_keys,
-                                   CudaRadixCiphertextFFI const *key,
-                                   int8_t *mem_ptr, void *const *bsks,
-                                   void *const *ksks);
-
-void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
-                                           int8_t **mem_ptr_void);
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -1,440 +0,0 @@
-#ifndef AES_UTILITIES
-#define AES_UTILITIES
-#include "../integer/integer_utilities.h"
-
-/**
- * This structure holds pre-computed LUTs for essential bitwise operations
- * required by the homomorphic AES circuit. Pre-computing these tables allows
- * for efficient application of non-linear functions like AND during the PBS
- * process. It includes LUTs for:
- * - AND: for the non-linear part of the S-Box.
- * - FLUSH: to clear carry bits and isolate the message bit (x -> x & 1).
- * - CARRY: to extract the carry bit for additions (x -> (x >> 1) & 1).
- */
-template <typename Torus> struct int_aes_lut_buffers {
-  int_radix_lut<Torus> *and_lut;
-  int_radix_lut<Torus> *flush_lut;
-  int_radix_lut<Torus> *carry_lut;
-
-  int_aes_lut_buffers(CudaStreams streams, const int_radix_params &params,
-                      bool allocate_gpu_memory, uint32_t num_aes_inputs,
-                      uint32_t sbox_parallelism, uint64_t &size_tracker) {
-
-    constexpr uint32_t AES_STATE_BITS = 128;
-    constexpr uint32_t SBOX_MAX_AND_GATES = 18;
-
-    this->and_lut = new int_radix_lut<Torus>(
-        streams, params, 1,
-        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
-        allocate_gpu_memory, size_tracker);
-    std::function<Torus(Torus, Torus)> and_lambda =
-        [](Torus a, Torus b) -> Torus { return a & b; };
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
-        this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, and_lambda, allocate_gpu_memory);
-    auto active_streams_and_lut = streams.active_gpu_subset(
-        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism);
-    this->and_lut->broadcast_lut(active_streams_and_lut);
-
-    this->flush_lut = new int_radix_lut<Torus>(
-        streams, params, 1, AES_STATE_BITS * num_aes_inputs,
-        allocate_gpu_memory, size_tracker);
-    std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
-      return x & 1;
-    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
-        this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, flush_lambda, allocate_gpu_memory);
-    auto active_streams_flush_lut =
-        streams.active_gpu_subset(AES_STATE_BITS * num_aes_inputs);
-    this->flush_lut->broadcast_lut(active_streams_flush_lut);
-
-    this->carry_lut = new int_radix_lut<Torus>(
-        streams, params, 1, num_aes_inputs, allocate_gpu_memory, size_tracker);
-    std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
-      return (x >> 1) & 1;
-    };
-    generate_device_accumulator(
-        streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
-        this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, carry_lambda, allocate_gpu_memory);
-    auto active_streams_carry_lut = streams.active_gpu_subset(num_aes_inputs);
-    this->carry_lut->broadcast_lut(active_streams_carry_lut);
-  }
-
-  void release(CudaStreams streams) {
-    this->and_lut->release(streams);
-    delete this->and_lut;
-    this->and_lut = nullptr;
-
-    this->flush_lut->release(streams);
-    delete this->flush_lut;
-    this->flush_lut = nullptr;
-
-    this->carry_lut->release(streams);
-    delete this->carry_lut;
-    this->carry_lut = nullptr;
-  }
-};
-
-/**
- * The operations within an AES round, particularly MixColumns, require
- * intermediate storage for calculations. These buffers are designed to hold
- * temporary values like copies of columns or the results of multiplications,
- * avoiding overwriting data that is still needed in the same round.
- */
-template <typename Torus> struct int_aes_round_workspaces {
-  CudaRadixCiphertextFFI *mix_columns_col_copy_buffer;
-  CudaRadixCiphertextFFI *mix_columns_mul_workspace_buffer;
-  CudaRadixCiphertextFFI *vec_tmp_bit_buffer;
-
-  int_aes_round_workspaces(CudaStreams streams, const int_radix_params &params,
-                           bool allocate_gpu_memory, uint32_t num_aes_inputs,
-                           uint64_t &size_tracker) {
-
-    constexpr uint32_t BITS_PER_BYTE = 8;
-    constexpr uint32_t BYTES_PER_COLUMN = 4;
-    constexpr uint32_t BITS_PER_COLUMN = BITS_PER_BYTE * BYTES_PER_COLUMN;
-    constexpr uint32_t MIX_COLUMNS_MUL_WORKSPACE_BYTES = BYTES_PER_COLUMN + 1;
-
-    this->mix_columns_col_copy_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->mix_columns_col_copy_buffer, BITS_PER_COLUMN * num_aes_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->mix_columns_mul_workspace_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->mix_columns_mul_workspace_buffer,
-        MIX_COLUMNS_MUL_WORKSPACE_BYTES * BITS_PER_BYTE * num_aes_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->vec_tmp_bit_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->vec_tmp_bit_buffer,
-        num_aes_inputs, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-  }
-
-  void release(CudaStreams streams, bool allocate_gpu_memory) {
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->mix_columns_col_copy_buffer,
-                                   allocate_gpu_memory);
-    delete this->mix_columns_col_copy_buffer;
-    this->mix_columns_col_copy_buffer = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->mix_columns_mul_workspace_buffer,
-                                   allocate_gpu_memory);
-    delete this->mix_columns_mul_workspace_buffer;
-    this->mix_columns_mul_workspace_buffer = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->vec_tmp_bit_buffer,
-                                   allocate_gpu_memory);
-    delete this->vec_tmp_bit_buffer;
-    this->vec_tmp_bit_buffer = nullptr;
-  }
-};
-
-/**
- * In CTR mode, a counter is homomorphically added to the encrypted IV. This
- * structure holds the necessary buffers for this 128-bit ripple-carry
- * addition, such as the buffer for the propagating carry bit
- * (`vec_tmp_carry_buffer`) across the addition chain.
- */
-template <typename Torus> struct int_aes_counter_workspaces {
-  CudaRadixCiphertextFFI *vec_tmp_carry_buffer;
-  CudaRadixCiphertextFFI *vec_tmp_sum_buffer;
-  CudaRadixCiphertextFFI *vec_trivial_b_bits_buffer;
-  Torus *h_counter_bits_buffer;
-  Torus *d_counter_bits_buffer;
-
-  int_aes_counter_workspaces(CudaStreams streams,
-                             const int_radix_params &params,
-                             bool allocate_gpu_memory, uint32_t num_aes_inputs,
-                             uint64_t &size_tracker) {
-
-    this->vec_tmp_carry_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->vec_tmp_carry_buffer,
-        num_aes_inputs, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->vec_tmp_sum_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->vec_tmp_sum_buffer,
-        num_aes_inputs, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->vec_trivial_b_bits_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->vec_trivial_b_bits_buffer, num_aes_inputs,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->h_counter_bits_buffer =
-        (Torus *)malloc(num_aes_inputs * sizeof(Torus));
-    size_tracker += num_aes_inputs * sizeof(Torus);
-    this->d_counter_bits_buffer = (Torus *)cuda_malloc_with_size_tracking_async(
-        num_aes_inputs * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
-        size_tracker, allocate_gpu_memory);
-  }
-
-  void release(CudaStreams streams, bool allocate_gpu_memory) {
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->vec_tmp_carry_buffer,
-                                   allocate_gpu_memory);
-    delete this->vec_tmp_carry_buffer;
-    this->vec_tmp_carry_buffer = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->vec_tmp_sum_buffer,
-                                   allocate_gpu_memory);
-    delete this->vec_tmp_sum_buffer;
-    this->vec_tmp_sum_buffer = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->vec_trivial_b_bits_buffer,
-                                   allocate_gpu_memory);
-    delete this->vec_trivial_b_bits_buffer;
-    this->vec_trivial_b_bits_buffer = nullptr;
-
-    free(this->h_counter_bits_buffer);
-    if (allocate_gpu_memory) {
-      cuda_drop_async(this->d_counter_bits_buffer, streams.stream(0),
-                      streams.gpu_index(0));
-      streams.synchronize();
-    }
-  }
-};
-
-/**
- * This structure allocates the most significant memory blocks:
- * - `sbox_internal_workspace`: A large workspace for the complex, parallel
- * evaluation of the S-Box circuit.
- * - `main_bitsliced_states_buffer`: Holds the entire set of AES states in a
- * bitsliced layout, which is optimal for parallel bitwise operations on the
- * GPU.
- * - Other buffers are used for data layout transformations (transposition) and
- * for batching small operations into larger, more efficient launches.
- */
-template <typename Torus> struct int_aes_main_workspaces {
-  CudaRadixCiphertextFFI *sbox_internal_workspace;
-  CudaRadixCiphertextFFI *initial_states_and_jit_key_workspace;
-  CudaRadixCiphertextFFI *main_bitsliced_states_buffer;
-  CudaRadixCiphertextFFI *tmp_tiled_key_buffer;
-  CudaRadixCiphertextFFI *batch_processing_buffer;
-
-  int_aes_main_workspaces(CudaStreams streams, const int_radix_params &params,
-                          bool allocate_gpu_memory, uint32_t num_aes_inputs,
-                          uint32_t sbox_parallelism, uint64_t &size_tracker) {
-
-    constexpr uint32_t AES_STATE_BITS = 128;
-    constexpr uint32_t SBOX_MAX_AND_GATES = 18;
-    constexpr uint32_t BATCH_BUFFER_OPERANDS = 3;
-
-    this->sbox_internal_workspace = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->sbox_internal_workspace,
-        num_aes_inputs * AES_STATE_BITS * sbox_parallelism,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->initial_states_and_jit_key_workspace = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->initial_states_and_jit_key_workspace,
-        num_aes_inputs * AES_STATE_BITS, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->main_bitsliced_states_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0),
-        this->main_bitsliced_states_buffer, num_aes_inputs * AES_STATE_BITS,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->tmp_tiled_key_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->tmp_tiled_key_buffer,
-        num_aes_inputs * AES_STATE_BITS, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->batch_processing_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->batch_processing_buffer,
-        num_aes_inputs * SBOX_MAX_AND_GATES * BATCH_BUFFER_OPERANDS *
-            sbox_parallelism,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-  }
-
-  void release(CudaStreams streams, bool allocate_gpu_memory) {
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->sbox_internal_workspace,
-                                   allocate_gpu_memory);
-    delete this->sbox_internal_workspace;
-    this->sbox_internal_workspace = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->initial_states_and_jit_key_workspace,
-                                   allocate_gpu_memory);
-    delete this->initial_states_and_jit_key_workspace;
-    this->initial_states_and_jit_key_workspace = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->main_bitsliced_states_buffer,
-                                   allocate_gpu_memory);
-    delete this->main_bitsliced_states_buffer;
-    this->main_bitsliced_states_buffer = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->tmp_tiled_key_buffer,
-                                   allocate_gpu_memory);
-    delete this->tmp_tiled_key_buffer;
-    this->tmp_tiled_key_buffer = nullptr;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->batch_processing_buffer,
-                                   allocate_gpu_memory);
-    delete this->batch_processing_buffer;
-    this->batch_processing_buffer = nullptr;
-  }
-};
-
-/**
- * This structure acts as a container, holding instances of all the other buffer
- * management structs. It provides a
- * single object to manage the entire lifecycle of memory needed for a complete
- * AES-CTR encryption operation.
- */
-template <typename Torus> struct int_aes_encrypt_buffer {
-  int_radix_params params;
-  bool allocate_gpu_memory;
-  uint32_t num_aes_inputs;
-  uint32_t sbox_parallel_instances;
-
-  int_aes_lut_buffers<Torus> *luts;
-  int_aes_round_workspaces<Torus> *round_workspaces;
-  int_aes_counter_workspaces<Torus> *counter_workspaces;
-  int_aes_main_workspaces<Torus> *main_workspaces;
-
-  int_aes_encrypt_buffer(CudaStreams streams, const int_radix_params &params,
-                         bool allocate_gpu_memory, uint32_t num_aes_inputs,
-                         uint32_t sbox_parallelism, uint64_t &size_tracker) {
-
-    PANIC_IF_FALSE(num_aes_inputs >= 1,
-                   "num_aes_inputs should be greater or equal to 1");
-
-    this->params = params;
-    this->allocate_gpu_memory = allocate_gpu_memory;
-    this->num_aes_inputs = num_aes_inputs;
-    this->sbox_parallel_instances = sbox_parallelism;
-
-    this->luts = new int_aes_lut_buffers<Torus>(
-        streams, params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism,
-        size_tracker);
-
-    this->round_workspaces = new int_aes_round_workspaces<Torus>(
-        streams, params, allocate_gpu_memory, num_aes_inputs, size_tracker);
-
-    this->counter_workspaces = new int_aes_counter_workspaces<Torus>(
-        streams, params, allocate_gpu_memory, num_aes_inputs, size_tracker);
-
-    this->main_workspaces = new int_aes_main_workspaces<Torus>(
-        streams, params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism,
-        size_tracker);
-  }
-
-  void release(CudaStreams streams) {
-    luts->release(streams);
-    delete luts;
-    luts = nullptr;
-
-    round_workspaces->release(streams, allocate_gpu_memory);
-    delete round_workspaces;
-    round_workspaces = nullptr;
-
-    counter_workspaces->release(streams, allocate_gpu_memory);
-    delete counter_workspaces;
-    counter_workspaces = nullptr;
-
-    main_workspaces->release(streams, allocate_gpu_memory);
-    delete main_workspaces;
-    main_workspaces = nullptr;
-  }
-};
-
-/**
- * This structure holds the buffer for the 44 words of the expanded key
- * and temporary storage for word manipulations.
- * It contains its own instance of `int_aes_encrypt_buffer` because the
- * key expansion algorithm itself requires using the S-Box.
- * This separation ensures that memory for key expansion can be allocated and
- * freed independently of the main encryption process.
- */
-template <typename Torus> struct int_key_expansion_buffer {
-  int_radix_params params;
-  bool allocate_gpu_memory;
-
-  CudaRadixCiphertextFFI *words_buffer;
-
-  CudaRadixCiphertextFFI *tmp_word_buffer;
-  CudaRadixCiphertextFFI *tmp_rotated_word_buffer;
-
-  int_aes_encrypt_buffer<Torus> *aes_encrypt_buffer;
-
-  int_key_expansion_buffer(CudaStreams streams, const int_radix_params &params,
-                           bool allocate_gpu_memory, uint64_t &size_tracker) {
-    this->params = params;
-    this->allocate_gpu_memory = allocate_gpu_memory;
-
-    constexpr uint32_t TOTAL_WORDS = 44;
-    constexpr uint32_t BITS_PER_WORD = 32;
-    constexpr uint32_t TOTAL_BITS = TOTAL_WORDS * BITS_PER_WORD;
-
-    this->words_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->words_buffer, TOTAL_BITS,
-        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
-
-    this->tmp_word_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->tmp_word_buffer,
-        BITS_PER_WORD, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->tmp_rotated_word_buffer = new CudaRadixCiphertextFFI;
-    create_zero_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), this->tmp_rotated_word_buffer,
-        BITS_PER_WORD, params.big_lwe_dimension, size_tracker,
-        allocate_gpu_memory);
-
-    this->aes_encrypt_buffer = new int_aes_encrypt_buffer<Torus>(
-        streams, params, allocate_gpu_memory, 1, 4, size_tracker);
-  }
-
-  void release(CudaStreams streams) {
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->words_buffer, allocate_gpu_memory);
-    delete this->words_buffer;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->tmp_word_buffer, allocate_gpu_memory);
-    delete this->tmp_word_buffer;
-
-    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
-                                   this->tmp_rotated_word_buffer,
-                                   allocate_gpu_memory);
-    delete this->tmp_rotated_word_buffer;
-
-    this->aes_encrypt_buffer->release(streams);
-    delete this->aes_encrypt_buffer;
-  }
-};
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -5,7 +5,6 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cuda_runtime.h>
-#include <memory>

 extern "C" {

@@ -141,34 +140,4 @@ template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
                          Torus *d_array, Torus value, Torus n);

-template <class T> struct malloc_with_size_tracking_async_deleter {
-private:
-  cudaStream_t _stream;
-  uint32_t _gpu_index;
-  uint64_t &_size_tracker;
-  bool _allocate_gpu_memory;
-
-public:
-  malloc_with_size_tracking_async_deleter(cudaStream_t stream,
-                                          uint32_t gpu_index,
-                                          uint64_t &size_tracker,
-                                          bool allocate_gpu_memory)
-      : _stream(stream), _gpu_index(gpu_index), _size_tracker(size_tracker),
-        _allocate_gpu_memory(allocate_gpu_memory)
-
-  {}
-  void operator()(T *ptr) { cuda_drop_with_size_tracking_async(ptr, _stream, _gpu_index, _allocate_gpu_memory) ; }
-};
-
-template <class T>
-std::shared_ptr<T> cuda_make_shared_with_size_tracking_async(
-    uint64_t size, cudaStream_t stream, uint32_t gpu_index,
-    uint64_t &size_tracker, bool allocate_gpu_memory) {
-  return std::shared_ptr<T>(
-      (T*)cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
-                                           size_tracker, allocate_gpu_memory),
-      malloc_with_size_tracking_async_deleter<T>(
-          stream, gpu_index, size_tracker, allocate_gpu_memory));
-}
-
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -78,9 +78,10 @@ public:
                       get_active_gpu_count(num_radix_blocks, _gpu_count));
  }

-  // Returns a CudaStreams struct containing only the ith stream
-  CudaStreams get_ith(int i) const {
-    return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
+  // Returns a subset containing only the first gpu of this set. It
+  // is used to create subset of streams for mono-GPU functions
+  CudaStreams subset_first_gpu() const {
+    return CudaStreams(_streams, _gpu_indexes, 1);
  }

  // Synchronize all the streams in the set
@@ -183,93 +184,4 @@ public:
  }
 };

-struct CudaStreamsBarrier {
-private:
-  std::vector<cudaEvent_t> _events;
-  CudaStreams _streams;
-
-  CudaStreamsBarrier(const CudaStreamsBarrier &) {} // Prevent copy-construction
-  CudaStreamsBarrier &operator=(const CudaStreamsBarrier &) {
-    return *this;
-  } // Prevent assignment
-public:
-  void create_on(const CudaStreams &streams) {
-    _streams = streams;
-
-    GPU_ASSERT(streams.count() > 1, "CudaStreamsFirstWaitsWorkersBarrier: "
-                                    "Attempted to create on single GPU");
-    _events.resize(streams.count());
-    for (int i = 0; i < streams.count(); i++) {
-      _events[i] = cuda_create_event(streams.gpu_index(i));
-    }
-  }
-
-  CudaStreamsBarrier(){};
-
-  void local_streams_wait_for_stream_0(const CudaStreams &user_streams) {
-    GPU_ASSERT(!_events.empty(),
-               "CudaStreamsBarrier: must call create_on before use");
-    GPU_ASSERT(user_streams.gpu_index(0) == _streams.gpu_index(0),
-               "CudaStreamsBarrier: synchronization can only be performed on "
-               "the GPUs the barrier was initially created on.");
-
-    cuda_event_record(_events[0], user_streams.stream(0),
-                      user_streams.gpu_index(0));
-    for (int j = 1; j < user_streams.count(); j++) {
-      GPU_ASSERT(user_streams.gpu_index(j) == _streams.gpu_index(j),
-                 "CudaStreamsBarrier: synchronization can only be performed on "
-                 "the GPUs the barrier was initially created on.");
-      cuda_stream_wait_event(user_streams.stream(j), _events[0],
-                             user_streams.gpu_index(j));
-    }
-  }
-
-  void stream_0_wait_for_local_streams(const CudaStreams &user_streams) {
-    GPU_ASSERT(
-        !_events.empty(),
-        "CudaStreamsFirstWaitsWorkersBarrier: must call create_on before use");
-    GPU_ASSERT(
-        user_streams.count() <= _events.size(),
-        "CudaStreamsFirstWaitsWorkersBarrier: trying to synchronize too many "
-        "streams. "
-        "The barrier was created on a LUT that had %lu active streams, while "
-        "the user stream set has %u streams",
-        _events.size(), user_streams.count());
-
-    if (user_streams.count() > 1) {
-      // Worker GPUs record their events
-      for (int j = 1; j < user_streams.count(); j++) {
-        GPU_ASSERT(_streams.gpu_index(j) == user_streams.gpu_index(j),
-                   "CudaStreamsBarrier: The user stream "
-                   "set GPU[%d]=%u while the LUT stream set GPU[%d]=%u",
-                   j, user_streams.gpu_index(j), j, _streams.gpu_index(j));
-
-        cuda_event_record(_events[j], user_streams.stream(j),
-                          user_streams.gpu_index(j));
-      }
-
-      // GPU 0 waits for all workers
-      for (int j = 1; j < user_streams.count(); j++) {
-        cuda_stream_wait_event(user_streams.stream(0), _events[j],
-                               user_streams.gpu_index(0));
-      }
-    }
-  }
-
-  void release() {
-    for (int j = 0; j < _streams.count(); j++) {
-      cuda_event_destroy(_events[j], _streams.gpu_index(j));
-    }
-
-    _events.clear();
-  }
-
-  ~CudaStreamsBarrier() {
-    GPU_ASSERT(_events.empty(),
-               "CudaStreamsBarrier: must "
-               "call release before destruction: events size = %lu",
-               _events.size());
-  }
-};
-
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -4,6 +4,26 @@
 #include "../../pbs/pbs_enums.h"
 #include "../integer.h"

+typedef struct {
+  void *ptr;
+  uint32_t num_radix_blocks;
+  uint32_t lwe_dimension;
+} CudaLweCiphertextListFFI;
+
+typedef struct {
+  void *ptr;
+  uint32_t storage_log_modulus;
+  uint32_t lwe_per_glwe;
+  // Input LWEs are grouped by groups of `lwe_per_glwe`(the last group may be
+  // smaller)
+  // Each group is then packed into one GLWE with `lwe_per_glwe` bodies (one for
+  // each LWE of the group). In the end the total number of bodies is equal to
+  // the number of input LWE
+  uint32_t total_lwe_bodies_count;
+  uint32_t glwe_dimension;
+  uint32_t polynomial_size;
+} CudaPackedGlweCiphertextListFFI;
+
 extern "C" {
 uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
    CudaStreamsFFI streams, int8_t **mem_ptr,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -86,26 +86,6 @@ typedef struct {
  bool const divisor_has_more_bits_than_numerator;
 } CudaScalarDivisorFFI;

-typedef struct {
-  void *ptr;
-  uint32_t num_radix_blocks;
-  uint32_t lwe_dimension;
-} CudaLweCiphertextListFFI;
-
-typedef struct {
-  void *ptr;
-  uint32_t storage_log_modulus;
-  uint32_t lwe_per_glwe;
-  // Input LWEs are grouped by groups of `lwe_per_glwe`(the last group may be
-  // smaller)
-  // Each group is then packed into one GLWE with `lwe_per_glwe` bodies (one for
-  // each LWE of the group). In the end the total number of bodies is equal to
-  // the number of input LWE
-  uint32_t total_lwe_bodies_count;
-  uint32_t glwe_dimension;
-  uint32_t polynomial_size;
-} CudaPackedGlweCiphertextListFFI;
-
 uint64_t scratch_cuda_apply_univariate_lut_kb_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -125,7 +105,9 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
 void cuda_apply_univariate_lut_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks);

 void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
                                             int8_t **mem_ptr_void);
@@ -143,8 +125,9 @@ void cuda_apply_bivariate_lut_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe_1,
    CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
-    uint32_t shift);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_radix_blocks, uint32_t shift);

 void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void);
@@ -152,8 +135,9 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
 void cuda_apply_many_univariate_lut_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_luts,
-    uint32_t lut_stride);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_luts, uint32_t lut_stride);

 uint64_t scratch_cuda_full_propagation_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
@@ -163,10 +147,11 @@ uint64_t scratch_cuda_full_propagation_64(
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
-                                      CudaRadixCiphertextFFI *input_blocks,
-                                      int8_t *mem_ptr, void *const *ksks,
-                                      void *const *bsks, uint32_t num_blocks);
+void cuda_full_propagation_64_inplace(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
+    int8_t *mem_ptr, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_blocks);

 void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
                                   int8_t **mem_ptr_void);
@@ -184,8 +169,9 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
    CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
-    void *const *bsks, void *const *ksks, int8_t *mem_ptr,
-    uint32_t polynomial_size, uint32_t num_blocks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);

 void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void);

@@ -210,7 +196,8 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(

 void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
@@ -223,7 +210,8 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(

 void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
                                                     int8_t **mem_ptr_void);
@@ -243,7 +231,8 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
 void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
    CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
                                                 int8_t **mem_ptr_void);
@@ -262,13 +251,16 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
    void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_scalar_blocks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_scalar_blocks);

 void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
                                     int8_t **mem_ptr_void);
@@ -286,13 +278,15 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
    void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);

@@ -310,7 +304,8 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
    CudaRadixCiphertextFFI const *lwe_condition,
    CudaRadixCiphertextFFI const *lwe_array_true,
    CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
                                     int8_t **mem_ptr_void);
@@ -326,7 +321,8 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(

 void cuda_integer_radix_scalar_rotate_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
                                              int8_t **mem_ptr_void);
@@ -353,13 +349,16 @@ void cuda_propagate_single_carry_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
    CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t requested_flag, uint32_t uses_carry);

 void cuda_add_and_propagate_single_carry_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
    const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t requested_flag, uint32_t uses_carry);

 void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
                                         int8_t **mem_ptr_void);
@@ -381,8 +380,9 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
    const CudaRadixCiphertextFFI *rhs_array,
    CudaRadixCiphertextFFI *overflow_block,
    const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t compute_overflow,
-    uint32_t uses_input_borrow);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t compute_overflow, uint32_t uses_input_borrow);

 void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
                                          int8_t **mem_ptr_void);
@@ -400,7 +400,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -417,6 +418,7 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
    uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars);

 void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
@@ -435,7 +437,8 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
    CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
    CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams,
                                  int8_t **mem_ptr_void);
@@ -452,7 +455,9 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_blocks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_blocks);

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -471,7 +476,8 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

 void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
-    bool is_signed, void *const *bsks, void *const *ksks);
+    bool is_signed, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
                                      int8_t **mem_ptr_void);
@@ -488,7 +494,9 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
 void cuda_integer_are_all_comparisons_block_true_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks);

 void cleanup_cuda_integer_are_all_comparisons_block_true(CudaStreamsFFI streams,
                                                         int8_t **mem_ptr_void);
@@ -505,7 +513,9 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
 void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks);

 void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -531,7 +541,9 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
 void cuda_apply_noise_squashing_kb(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks);

 void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
                                           int8_t **mem_ptr_void);
@@ -549,7 +561,9 @@ void cuda_sub_and_propagate_single_carry_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
    const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t requested_flag, uint32_t uses_carry);

 void cleanup_cuda_sub_and_propagate_single_carry(CudaStreamsFFI streams,
                                                 int8_t **mem_ptr_void);
@@ -566,6 +580,7 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
 void cuda_integer_unsigned_scalar_div_radix_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi);

 void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
@@ -580,12 +595,11 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
-                                        CudaRadixCiphertextFFI *output,
-                                        CudaRadixCiphertextFFI const *input,
-                                        int8_t *mem_ptr,
-                                        uint32_t num_additional_blocks,
-                                        void *const *bsks, void *const *ksks);
+void cuda_extend_radix_with_sign_msb_64(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
+    CudaRadixCiphertextFFI const *input, int8_t *mem_ptr,
+    uint32_t num_additional_blocks, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
                                                int8_t **mem_ptr_void);
@@ -602,6 +616,7 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
 void cuda_integer_signed_scalar_div_radix_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);

 void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
@@ -620,7 +635,9 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
 void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
    CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    void const *clear_blocks, void const *h_clear_blocks,
@@ -642,7 +659,9 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
 void cuda_integer_signed_scalar_div_rem_radix_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
    CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    uint32_t numerator_bits);
@@ -662,7 +681,8 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
 void cuda_integer_count_of_consecutive_bits_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
    CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
+    void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key);

 void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
    CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -672,15 +692,16 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks_to_process,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type);
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t total_random_bits,
+    PBS_MS_REDUCTION_T noise_reduction_type);

-void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
-                                        CudaRadixCiphertextFFI *radix_lwe_out,
-                                        const void *seeded_lwe_input,
-                                        uint32_t num_blocks_to_process,
-                                        int8_t *mem, void *const *bsks);
+void cuda_integer_grouped_oprf_async_64(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
+    const void *seeded_lwe_input, uint32_t num_blocks_to_process, int8_t *mem,
+    void *const *bsks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
                                          int8_t **mem_ptr_void);
@@ -700,7 +721,8 @@ void cuda_integer_ilog2_kb_64(
    CudaRadixCiphertextFFI const *trivial_ct_neg_n,
    CudaRadixCiphertextFFI const *trivial_ct_2,
    CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key);

 void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
                                      int8_t **mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
@@ -3,6 +3,16 @@
 #include <stdint.h>
 enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
 enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
-enum PBS_MS_REDUCTION_T { NO_REDUCTION = 0, CENTERED = 1 };
+enum PBS_MS_REDUCTION_T { NO_REDUCTION = 0, DRIFT = 1, CENTERED = 2 };
+
+extern "C" {
+typedef struct {
+  void *const *ptr;
+  uint32_t num_zeros;
+  double ms_bound;
+  double ms_r_sigma;
+  double ms_input_variance;
+} CudaModulusSwitchNoiseReductionKeyFFI;
+}

 #endif // CUDA_PBS_ENUMS_H
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -80,6 +80,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

  Torus *global_accumulator;
  double2 *global_join_buffer;
+  Torus *temp_lwe_array_in;

  PBS_VARIANT pbs_variant;
  PBS_MS_REDUCTION_T noise_reduction_type;
@@ -96,6 +97,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
    this->pbs_variant = pbs_variant;

    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+    this->temp_lwe_array_in = (Torus *)cuda_malloc_with_size_tracking_async(
+        (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(Torus),
+        stream, gpu_index, size_tracker,
+        noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT);
    switch (pbs_variant) {
    case PBS_VARIANT::DEFAULT: {
      uint64_t full_sm_step_one =
@@ -229,6 +234,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
    if (pbs_variant == DEFAULT)
      cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                         gpu_memory_allocated);
+
+    if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
+      cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
+                                         gpu_memory_allocated);
  }
 };

@@ -240,6 +249,8 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {

  __uint128_t *global_accumulator;
  double *global_join_buffer;
+  InputTorus *temp_lwe_array_in;
+  uint64_t *trivial_indexes;

  PBS_VARIANT pbs_variant;
  PBS_MS_REDUCTION_T noise_reduction_type;
@@ -257,6 +268,27 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
    cuda_set_device(gpu_index);
    this->pbs_variant = pbs_variant;

+    if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
+      this->temp_lwe_array_in =
+          (InputTorus *)cuda_malloc_with_size_tracking_async(
+              (lwe_dimension + 1) * input_lwe_ciphertext_count *
+                  sizeof(InputTorus),
+              stream, gpu_index, size_tracker, allocate_gpu_memory);
+      this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          size_tracker, allocate_gpu_memory);
+      uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
+      for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
+        h_trivial_indexes[i] = i;
+
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          trivial_indexes, h_trivial_indexes,
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          allocate_gpu_memory);
+
+      cuda_synchronize_stream(stream, gpu_index);
+      delete[] h_trivial_indexes;
+    }
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
                                     input_lwe_ciphertext_count *
@@ -392,6 +424,13 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
    if (pbs_variant == DEFAULT)
      cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                         gpu_memory_allocated);
+
+    if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
+      cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
+                                         gpu_memory_allocated);
+      cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
+                                         gpu_memory_allocated);
+    }
  }
 };

--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -89,14 +89,18 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
+    uint32_t lut_stride);

 void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lut_vector, void const *lwe_array_in,
-    void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
+    void const *bootstrapping_key,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void const *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples);

--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h
@@ -22,7 +22,8 @@ uint64_t scratch_cuda_expand_without_verification_64(
 void cuda_expand_without_verification_64(
    CudaStreamsFFI streams, void *lwe_array_out,
    const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *computing_ksks, void *const *casting_keys);
+    void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
                                            int8_t **mem_ptr_void);
--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes.cu
@@ -1,88 +0,0 @@
-#include "../../include/aes/aes.h"
-#include "aes.cuh"
-
-uint64_t scratch_cuda_integer_aes_encrypt_64(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
-    uint32_t sbox_parallelism) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
-
-  return scratch_cuda_integer_aes_encrypt<uint64_t>(
-      CudaStreams(streams), (int_aes_encrypt_buffer<uint64_t> **)mem_ptr,
-      params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism);
-}
-
-void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
-                                     CudaRadixCiphertextFFI *output,
-                                     CudaRadixCiphertextFFI const *iv,
-                                     CudaRadixCiphertextFFI const *round_keys,
-                                     const uint64_t *counter_bits_le_all_blocks,
-                                     uint32_t num_aes_inputs, int8_t *mem_ptr,
-                                     void *const *bsks, void *const *ksks) {
-
-  host_integer_aes_ctr_encrypt<uint64_t>(
-      CudaStreams(streams), output, iv, round_keys, counter_bits_le_all_blocks,
-      num_aes_inputs, (int_aes_encrypt_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)ksks);
-}
-
-void cleanup_cuda_integer_aes_encrypt_64(CudaStreamsFFI streams,
-                                         int8_t **mem_ptr_void) {
-
-  int_aes_encrypt_buffer<uint64_t> *mem_ptr =
-      (int_aes_encrypt_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release(CudaStreams(streams));
-
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
-
-uint64_t scratch_cuda_integer_key_expansion_64(
-    CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory,
-    PBS_MS_REDUCTION_T noise_reduction_type) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus,
-                          noise_reduction_type);
-
-  return scratch_cuda_integer_key_expansion<uint64_t>(
-      CudaStreams(streams), (int_key_expansion_buffer<uint64_t> **)mem_ptr,
-      params, allocate_gpu_memory);
-}
-
-void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
-                                   CudaRadixCiphertextFFI *expanded_keys,
-                                   CudaRadixCiphertextFFI const *key,
-                                   int8_t *mem_ptr, void *const *bsks,
-                                   void *const *ksks) {
-
-  host_integer_key_expansion<uint64_t>(
-      CudaStreams(streams), expanded_keys, key,
-      (int_key_expansion_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
-}
-
-void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
-                                           int8_t **mem_ptr_void) {
-  int_key_expansion_buffer<uint64_t> *mem_ptr =
-      (int_key_expansion_buffer<uint64_t> *)(*mem_ptr_void);
-
-  mem_ptr->release(CudaStreams(streams));
-  delete mem_ptr;
-  *mem_ptr_void = nullptr;
-}
--- a/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -103,6 +103,23 @@ void cuda_centered_modulus_switch_64(void *stream, uint32_t gpu_index,
      lwe_dimension, log_modulus);
 }

+// Test-only entry point: forwards to host_drift_modulus_switch<uint64_t>
+// after casting the opaque FFI pointers to their 64-bit torus types.
+// Its output always follows trivial ordering.
+void cuda_improve_noise_modulus_switch_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus) {
+  // Recover the typed views of the untyped arguments up front so the
+  // forwarding call below stays readable.
+  auto cuda_stream = static_cast<cudaStream_t>(stream);
+  auto *out = static_cast<uint64_t *>(lwe_array_out);
+  auto const *in = static_cast<uint64_t const *>(lwe_array_in);
+  auto const *indexes = static_cast<uint64_t const *>(lwe_array_indexes);
+  auto const *zeros = static_cast<uint64_t const *>(encrypted_zeros);
+
+  host_drift_modulus_switch<uint64_t>(cuda_stream, gpu_index, out, in, indexes,
+                                      zeros, lwe_size, num_lwes, num_zeros,
+                                      input_variance, r_sigma, bound,
+                                      log_modulus);
+}
+
 void cuda_glwe_sample_extract_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -36,7 +36,7 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
 *
 */
 // Each thread in x are used to calculate one output.
-// threads in y are used to parallelize the lwe_dimension_in loop.
+// threads in y are used to parallelize the lwe_dimension_in loop.
 // shared memory is used to store intermediate results of the reduction.
 // Note: To reduce register pressure we have slightly changed the algorithm,
 // the idea consists in calculating the negate value of the output. So, instead
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -426,4 +426,31 @@ __global__ void __launch_bounds__(512)
  }
 }

+// Host-side launcher for the improve_noise_modulus_switch kernel.
+//
+// Selects gpu_index as the current device, then launches the kernel on
+// `stream` with one block of 512 threads per input LWE (num_lwes blocks).
+// array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
+// r_sigma, bound and log_modulus are forwarded to the kernel unchanged.
+//
+// lwe_size must lie in [512, 1024]; anything else is rejected through
+// PANIC_IF_FALSE. The launch status is checked with check_cuda_error.
+// When num_lwes is 0 there is nothing to process and no kernel is launched
+// (a grid with zero blocks is not a valid launch configuration).
+template <typename Torus>
+__host__ void host_drift_modulus_switch(
+    cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
+    Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
+    uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
+    const double input_variance, const double r_sigma, const double bound,
+    uint32_t log_modulus) {
+
+  // lwe_size is unsigned, so it is formatted with %u rather than %d
+  PANIC_IF_FALSE(lwe_size >= 512,
+                 "The lwe_size (%u) is less than 512, this is not supported\n",
+                 lwe_size);
+  PANIC_IF_FALSE(
+      lwe_size <= 1024,
+      "The lwe_size (%u) is greater than 1024, this is not supported\n",
+      lwe_size);
+
+  // Empty batch: skip the launch instead of requesting a zero-sized grid
+  if (num_lwes == 0)
+    return;
+
+  cuda_set_device(gpu_index);
+
+  // This reduction requires a power of two number of threads
+  int num_threads = 512, num_blocks = num_lwes;
+
+  improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
+      array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
+      r_sigma, bound, log_modulus);
+  check_cuda_error(cudaGetLastError());
+}
+
 #endif // CNCRT_TORUS_H
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -266,11 +266,6 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    uint32_t gpu_index, bool gpu_memory_allocated) {
  if (size == 0 || !gpu_memory_allocated)
    return;
-  GPU_ASSERT(dest != nullptr,
-             "Cuda error: trying to copy gpu->gpu to null ptr");
-  GPU_ASSERT(src != nullptr,
-             "Cuda error: trying to copy gpu->gpu from null ptr");
-
  cudaPointerAttributes attr_dest;
  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
  PANIC_IF_FALSE(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
@@ -21,12 +21,14 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

 void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
-    bool is_signed, void *const *bsks, void *const *ksks) {
+    bool is_signed, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;

  host_integer_abs_kb<uint64_t>(CudaStreams(streams), ct, bsks,
-                                (uint64_t **)(ksks), mem, is_signed);
+                                (uint64_t **)(ksks), ms_noise_reduction_key,
+                                mem, is_signed);
 }

 void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -30,10 +30,11 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
 }

 template <typename Torus>
-__host__ void
-host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
-                    void *const *bsks, uint64_t *const *ksks,
-                    int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
+__host__ void host_integer_abs_kb(
+    CudaStreams streams, CudaRadixCiphertextFFI *ct, void *const *bsks,
+    uint64_t *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
  if (!is_signed)
    return;

@@ -48,19 +49,19 @@ host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,

  host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
      streams, mask, num_bits_in_ciphertext - 1,
-      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
+      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
  host_addition<Torus>(streams.stream(0), streams.gpu_index(0), ct, mask, ct,
                       ct->num_radix_blocks, mem_ptr->params.message_modulus,
                       mem_ptr->params.carry_modulus);

  uint32_t requested_flag = outputFlag::FLAG_NONE;
  uint32_t uses_carry = 0;
-  host_propagate_single_carry<Torus>(streams, ct, nullptr, nullptr,
-                                     mem_ptr->scp_mem, bsks, ksks,
-                                     requested_flag, uses_carry);
+  host_propagate_single_carry<Torus>(
+      streams, ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
+      ms_noise_reduction_key, requested_flag, uses_carry);

  host_integer_radix_bitop_kb<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem,
-                                     bsks, ksks);
+                                     bsks, ksks, ms_noise_reduction_key);
 }

 #endif // TFHE_RS_ABS_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -23,11 +23,13 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  host_integer_radix_bitop_kb<uint64_t>(
      CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      ms_noise_reduction_key);
 }

 void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -16,7 +16,8 @@ __host__ void host_integer_radix_bitop_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks) {
+    void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  PANIC_IF_FALSE(
      lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
@@ -44,8 +45,9 @@ __host__ void host_integer_radix_bitop_kb(
  }

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks, lut,
-      lwe_array_out->num_radix_blocks, lut->params.message_modulus);
+      streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
+      ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
+      lut->params.message_modulus);

  memcpy(lwe_array_out->degrees, degrees,
         lwe_array_out->num_radix_blocks * sizeof(uint64_t));
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
@@ -35,17 +35,16 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
      num_blocks, num_additional_blocks, allocate_gpu_memory);
 }

-void cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
-                                        CudaRadixCiphertextFFI *output,
-                                        CudaRadixCiphertextFFI const *input,
-                                        int8_t *mem_ptr,
-                                        uint32_t num_additional_blocks,
-                                        void *const *bsks, void *const *ksks) {
+void cuda_extend_radix_with_sign_msb_64(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
+    CudaRadixCiphertextFFI const *input, int8_t *mem_ptr,
+    uint32_t num_additional_blocks, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
  PUSH_RANGE("cast")
  host_extend_radix_with_sign_msb<uint64_t>(
      CudaStreams(streams), output, input,
      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
-      num_additional_blocks, bsks, (uint64_t **)ksks);
+      num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
  POP_RANGE()
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
@@ -55,7 +55,8 @@ __host__ void host_extend_radix_with_sign_msb(
    CudaStreams streams, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *input,
    int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
-    uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks) {
+    uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  if (num_additional_blocks == 0) {
    PUSH_RANGE("cast/extend no addblocks")
@@ -78,7 +79,8 @@ __host__ void host_extend_radix_with_sign_msb(
      input_blocks - 1, input_blocks);

  host_apply_univariate_lut_kb(streams, mem_ptr->padding_block,
-                               mem_ptr->last_block, mem_ptr->lut, ksks, bsks);
+                               mem_ptr->last_block, mem_ptr->lut, ksks,
+                               ms_noise_reduction_key, bsks);

  for (uint32_t i = 0; i < num_additional_blocks; ++i) {
    uint32_t dst_block_idx = input_blocks + i;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -29,12 +29,13 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
    CudaRadixCiphertextFFI const *lwe_condition,
    CudaRadixCiphertextFFI const *lwe_array_true,
    CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
  PUSH_RANGE("cmux")
  host_integer_radix_cmux_kb<uint64_t>(
      CudaStreams(streams), lwe_array_out, lwe_condition, lwe_array_true,
      lwe_array_false, (int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (uint64_t **)(ksks), ms_noise_reduction_key);
  POP_RANGE()
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -5,13 +5,15 @@
 #include "radix_ciphertext.cuh"

 template <typename Torus>
-__host__ void zero_out_if(CudaStreams streams,
-                          CudaRadixCiphertextFFI *lwe_array_out,
-                          CudaRadixCiphertextFFI const *lwe_array_input,
-                          CudaRadixCiphertextFFI const *lwe_condition,
-                          int_zero_out_if_buffer<Torus> *mem_ptr,
-                          int_radix_lut<Torus> *predicate, void *const *bsks,
-                          Torus *const *ksks, uint32_t num_radix_blocks) {
+__host__ void
+zero_out_if(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
+            CudaRadixCiphertextFFI const *lwe_array_input,
+            CudaRadixCiphertextFFI const *lwe_condition,
+            int_zero_out_if_buffer<Torus> *mem_ptr,
+            int_radix_lut<Torus> *predicate, void *const *bsks,
+            Torus *const *ksks,
+            CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+            uint32_t num_radix_blocks) {
  PANIC_IF_FALSE(
      lwe_array_out->num_radix_blocks >= num_radix_blocks &&
          lwe_array_input->num_radix_blocks >= num_radix_blocks,
@@ -31,13 +33,13 @@ __host__ void zero_out_if(CudaStreams streams,
  // second operand is not an array
  auto tmp_lwe_array_input = mem_ptr->tmp;
  host_pack_bivariate_blocks_with_single_block<Torus>(
-      streams, tmp_lwe_array_input, predicate->lwe_indexes_in.get(), lwe_array_input,
-      lwe_condition, predicate->lwe_indexes_in.get(), params.message_modulus,
+      streams, tmp_lwe_array_input, predicate->lwe_indexes_in, lwe_array_input,
+      lwe_condition, predicate->lwe_indexes_in, params.message_modulus,
      num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks, predicate,
-      num_radix_blocks);
+      streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks,
+      ms_noise_reduction_key, predicate, num_radix_blocks);
 }

 template <typename Torus>
@@ -46,7 +48,8 @@ __host__ void host_integer_radix_cmux_kb(
    CudaRadixCiphertextFFI const *lwe_condition,
    CudaRadixCiphertextFFI const *lwe_array_true,
    CudaRadixCiphertextFFI const *lwe_array_false,
-    int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
+    int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -70,8 +73,8 @@ __host__ void host_integer_radix_cmux_kb(
  }
  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, mem_ptr->buffer_out, mem_ptr->buffer_in,
-      mem_ptr->condition_array, bsks, ksks, mem_ptr->predicate_lut,
-      2 * num_radix_blocks, params.message_modulus);
+      mem_ptr->condition_array, bsks, ksks, ms_noise_reduction_key,
+      mem_ptr->predicate_lut, 2 * num_radix_blocks, params.message_modulus);

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
@@ -88,7 +91,7 @@ __host__ void host_integer_radix_cmux_kb(
                       params.message_modulus, params.carry_modulus);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, lwe_array_out, &mem_true, bsks, ksks,
+      streams, lwe_array_out, &mem_true, bsks, ksks, ms_noise_reduction_key,
      mem_ptr->message_extract_lut, num_radix_blocks);
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -41,7 +41,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
  PUSH_RANGE("comparison")
  if (lwe_array_1->num_radix_blocks != lwe_array_2->num_radix_blocks)
    PANIC("Cuda error: input num radix blocks must be the same")
@@ -56,7 +57,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  case NE:
    host_integer_radix_equality_check_kb<uint64_t>(
        CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
-        bsks, (uint64_t **)(ksks), num_radix_blocks);
+        bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
    break;
  case GT:
  case GE:
@@ -68,7 +69,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    host_integer_radix_difference_check_kb<uint64_t>(
        CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
        buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
-        num_radix_blocks);
+        ms_noise_reduction_key, num_radix_blocks);
    break;
  case MAX:
  case MIN:
@@ -76,7 +77,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
      PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
    host_integer_radix_maxmin_kb<uint64_t>(
        CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
-        bsks, (uint64_t **)(ksks), num_radix_blocks);
+        bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
@@ -117,14 +118,16 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
 void cuda_integer_are_all_comparisons_block_true_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;

  host_integer_are_all_comparisons_block_true_kb<uint64_t>(
      CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
-      (uint64_t **)(ksks), num_radix_blocks);
+      (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
 }

 void cleanup_cuda_integer_are_all_comparisons_block_true(
@@ -159,14 +162,16 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
 void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  int_comparison_buffer<uint64_t> *buffer =
      (int_comparison_buffer<uint64_t> *)mem_ptr;

  host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
      CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
-      (uint64_t **)(ksks), num_radix_blocks);
+      (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
 }

 void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -61,7 +61,9 @@ __host__ void are_all_comparisons_block_true(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
    PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -156,7 +158,8 @@ __host__ void are_all_comparisons_block_true(
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
+          streams, lwe_array_out, accumulator, bsks, ksks,
+          ms_noise_reduction_key, lut, 1);
      // Reset max_value_lut_indexes before returning, otherwise if the lut is
      // reused the lut indexes will be wrong
      memset(is_max_value_lut->h_lut_indexes, 0,
@@ -173,7 +176,8 @@ __host__ void are_all_comparisons_block_true(
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, tmp_out, accumulator, bsks, ksks, lut, num_chunks);
+          streams, tmp_out, accumulator, bsks, ksks, ms_noise_reduction_key,
+          lut, num_chunks);
    }
  }
 }
@@ -189,7 +193,9 @@ __host__ void is_at_least_one_comparisons_block_true(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
    PANIC("Cuda error: input lwe dimensions must be the same")
@@ -243,12 +249,12 @@ __host__ void is_at_least_one_comparisons_block_true(
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, lwe_array_out, buffer->tmp_block_accumulated, bsks, ksks,
-          lut, 1);
+          ms_noise_reduction_key, lut, 1);
      return;
    } else {
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, mem_ptr->tmp_lwe_array_out, buffer->tmp_block_accumulated,
-          bsks, ksks, lut, num_chunks);
+          bsks, ksks, ms_noise_reduction_key, lut, num_chunks);
    }
  }
 }
@@ -258,8 +264,9 @@ __host__ void host_compare_blocks_with_zero(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, int32_t num_radix_blocks,
-    int_radix_lut<Torus> *zero_comparison) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {

  if (num_radix_blocks == 0)
    return;
@@ -315,7 +322,8 @@ __host__ void host_compare_blocks_with_zero(
  }

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, lwe_array_out, sum, bsks, ksks, zero_comparison, num_sum_blocks);
+      streams, lwe_array_out, sum, bsks, ksks, ms_noise_reduction_key,
+      zero_comparison, num_sum_blocks);

  reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
 }
@@ -326,7 +334,9 @@ __host__ void host_integer_radix_equality_check_kb(
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
      lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -337,15 +347,16 @@ __host__ void host_integer_radix_equality_check_kb(
  auto comparisons = mem_ptr->tmp_block_comparisons;
  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, comparisons, lwe_array_1, lwe_array_2, bsks, ksks,
-      eq_buffer->operator_lut, num_radix_blocks,
+      ms_noise_reduction_key, eq_buffer->operator_lut, num_radix_blocks,
      eq_buffer->operator_lut->params.message_modulus);

  // This takes a Vec of blocks, where each block is either 0 or 1.
  //
  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
-  are_all_comparisons_block_true<Torus>(streams, lwe_array_out, comparisons,
-                                        mem_ptr, bsks, ksks, num_radix_blocks);
+  are_all_comparisons_block_true<Torus>(
+      streams, lwe_array_out, comparisons, mem_ptr, bsks, ksks,
+      ms_noise_reduction_key, num_radix_blocks);
 }

 template <typename Torus>
@@ -354,7 +365,9 @@ __host__ void compare_radix_blocks_kb(
    CudaRadixCiphertextFFI const *lwe_array_left,
    CudaRadixCiphertextFFI const *lwe_array_right,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
      lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -387,8 +400,8 @@ __host__ void compare_radix_blocks_kb(
  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, lwe_array_out, lwe_array_out, bsks, ksks, is_non_zero_lut,
-      num_radix_blocks);
+      streams, lwe_array_out, lwe_array_out, bsks, ksks, ms_noise_reduction_key,
+      is_non_zero_lut, num_radix_blocks);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
@@ -401,13 +414,14 @@ __host__ void compare_radix_blocks_kb(
 // (inferior, equal, superior) to one single shortint block containing the
 // final sign
 template <typename Torus>
-__host__ void
-tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
-                    CudaRadixCiphertextFFI *lwe_block_comparisons,
-                    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
-                    std::function<Torus(Torus)> sign_handler_f,
-                    void *const *bsks, Torus *const *ksks,
-                    uint32_t num_radix_blocks) {
+__host__ void tree_sign_reduction(
+    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaRadixCiphertextFFI *lwe_block_comparisons,
+    int_tree_sign_reduction_buffer<Torus> *tree_buffer,
+    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_block_comparisons->lwe_dimension)
    PANIC("Cuda error: input lwe dimensions must be the same")
@@ -440,7 +454,8 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
                       partial_block_count, message_modulus);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, x, y, bsks, ksks, inner_tree_leaf, partial_block_count >> 1);
+        streams, x, y, bsks, ksks, ms_noise_reduction_key, inner_tree_leaf,
+        partial_block_count >> 1);

    if ((partial_block_count % 2) != 0) {
      partial_block_count >>= 1;
@@ -486,7 +501,8 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,

  // Last leaf
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, lwe_array_out, y, bsks, ksks, last_lut, 1);
+      streams, lwe_array_out, y, bsks, ksks, ms_noise_reduction_key, last_lut,
+      1);
 }

 template <typename Torus>
@@ -496,7 +512,9 @@ __host__ void host_integer_radix_difference_check_kb(
    CudaRadixCiphertextFFI const *lwe_array_right,
    int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> reduction_lut_f, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
      lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -536,7 +554,7 @@ __host__ void host_integer_radix_difference_check_kb(
    auto identity_lut = mem_ptr->identity_lut;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, diff_buffer->tmp_packed, diff_buffer->tmp_packed, bsks, ksks,
-        identity_lut, 2 * packed_num_radix_blocks);
+        ms_noise_reduction_key, identity_lut, 2 * packed_num_radix_blocks);
  } else {
    as_radix_ciphertext_slice<Torus>(&lhs, lwe_array_left, 0,
                                     lwe_array_left->num_radix_blocks);
@@ -554,14 +572,16 @@ __host__ void host_integer_radix_difference_check_kb(
    // Compare packed blocks, or simply the total number of radix blocks in the
    // inputs
    compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
-                                   bsks, ksks, packed_num_radix_blocks);
+                                   bsks, ksks, ms_noise_reduction_key,
+                                   packed_num_radix_blocks);
    num_comparisons = packed_num_radix_blocks;
  } else {
    // Packing is possible
    if (carry_modulus >= message_modulus) {
      // Compare (num_radix_blocks - 2) / 2 packed blocks
      compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
-                                     bsks, ksks, packed_num_radix_blocks);
+                                     bsks, ksks, ms_noise_reduction_key,
+                                     packed_num_radix_blocks);

      // Compare the last block before the sign block separately
      auto identity_lut = mem_ptr->identity_lut;
@@ -575,7 +595,7 @@ __host__ void host_integer_radix_difference_check_kb(
                                       num_radix_blocks - 1);
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, &last_left_block_before_sign_block, &shifted_lwe_array_left,
-          bsks, ksks, identity_lut, 1);
+          bsks, ksks, ms_noise_reduction_key, identity_lut, 1);

      CudaRadixCiphertextFFI last_right_block_before_sign_block;
      as_radix_ciphertext_slice<Torus>(
@@ -588,7 +608,8 @@ __host__ void host_integer_radix_difference_check_kb(
                                       num_radix_blocks - 1);
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, &last_right_block_before_sign_block,
-          &shifted_lwe_array_right, bsks, ksks, identity_lut, 1);
+          &shifted_lwe_array_right, bsks, ksks, ms_noise_reduction_key,
+          identity_lut, 1);

      CudaRadixCiphertextFFI shifted_comparisons;
      as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -596,7 +617,8 @@ __host__ void host_integer_radix_difference_check_kb(
                                       packed_num_radix_blocks + 1);
      compare_radix_blocks_kb<Torus>(
          streams, &shifted_comparisons, &last_left_block_before_sign_block,
-          &last_right_block_before_sign_block, mem_ptr, bsks, ksks, 1);
+          &last_right_block_before_sign_block, mem_ptr, bsks, ksks,
+          ms_noise_reduction_key, 1);

      // Compare the sign block separately
      as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -610,14 +632,14 @@ __host__ void host_integer_radix_difference_check_kb(
                                       num_radix_blocks - 1, num_radix_blocks);
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, &shifted_comparisons, &last_left_block, &last_right_block,
-          bsks, ksks, mem_ptr->signed_lut, 1,
+          bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
          mem_ptr->signed_lut->params.message_modulus);
      num_comparisons = packed_num_radix_blocks + 2;

    } else {
-      compare_radix_blocks_kb<Torus>(streams, comparisons, lwe_array_left,
-                                     lwe_array_right, mem_ptr, bsks, ksks,
-                                     num_radix_blocks - 1);
+      compare_radix_blocks_kb<Torus>(
+          streams, comparisons, lwe_array_left, lwe_array_right, mem_ptr, bsks,
+          ksks, ms_noise_reduction_key, num_radix_blocks - 1);
      // Compare the sign block separately
      CudaRadixCiphertextFFI shifted_comparisons;
      as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -630,7 +652,7 @@ __host__ void host_integer_radix_difference_check_kb(
                                       num_radix_blocks - 1, num_radix_blocks);
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, &shifted_comparisons, &last_left_block, &last_right_block,
-          bsks, ksks, mem_ptr->signed_lut, 1,
+          bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
          mem_ptr->signed_lut->params.message_modulus);
      num_comparisons = num_radix_blocks;
    }
@@ -639,9 +661,9 @@ __host__ void host_integer_radix_difference_check_kb(
  // Reduces a vec containing radix blocks that encrypts a sign
  // (inferior, equal, superior) to one single radix block containing the
  // final sign
-  tree_sign_reduction<Torus>(streams, lwe_array_out, comparisons,
-                             mem_ptr->diff_buffer->tree_buffer, reduction_lut_f,
-                             bsks, ksks, num_comparisons);
+  tree_sign_reduction<Torus>(
+      streams, lwe_array_out, comparisons, mem_ptr->diff_buffer->tree_buffer,
+      reduction_lut_f, bsks, ksks, ms_noise_reduction_key, num_comparisons);
 }

 template <typename Torus>
@@ -663,7 +685,9 @@ __host__ void host_integer_radix_maxmin_kb(
    CudaRadixCiphertextFFI const *lwe_array_left,
    CudaRadixCiphertextFFI const *lwe_array_right,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
      lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -677,12 +701,14 @@ __host__ void host_integer_radix_maxmin_kb(
  // Compute the sign
  host_integer_radix_difference_check_kb<Torus>(
      streams, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
-      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks);
+      mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
+      num_radix_blocks);

  // Selector
-  host_integer_radix_cmux_kb<Torus>(
-      streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
+  host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
+                                    mem_ptr->tmp_lwe_array_out, lwe_array_left,
+                                    lwe_array_right, mem_ptr->cmux_buffer, bsks,
+                                    ksks, ms_noise_reduction_key);
 }

 template <typename Torus>
@@ -690,12 +716,15 @@ __host__ void host_integer_are_all_comparisons_block_true_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
-  are_all_comparisons_block_true<Torus>(streams, lwe_array_out, lwe_array_in,
-                                        mem_ptr, bsks, ksks, num_radix_blocks);
+  are_all_comparisons_block_true<Torus>(
+      streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
+      ms_noise_reduction_key, num_radix_blocks);
 }

 template <typename Torus>
@@ -703,12 +732,14 @@ __host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  // It returns a block encrypting 1 if all input blocks are 1
  // otherwise the block encrypts 0
-  is_at_least_one_comparisons_block_true<Torus>(streams, lwe_array_out,
-                                                lwe_array_in, mem_ptr, bsks,
-                                                ksks, num_radix_blocks);
+  is_at_least_one_comparisons_block_true<Torus>(
+      streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
+      ms_noise_reduction_key, num_radix_blocks);
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -344,7 +344,7 @@ host_integer_decompress(CudaStreams streams,
      execute_pbs_async<Torus, Torus>(
          active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out,
          lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
-          lut->lwe_indexes_in.get(), d_bsks, lut->buffer,
+          lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
          encryption_params.glwe_dimension,
          compression_params.small_lwe_dimension,
          encryption_params.polynomial_size, encryption_params.pbs_base_log,
@@ -359,13 +359,17 @@ host_integer_decompress(CudaStreams streams,
      std::vector<Torus *> lwe_trivial_indexes_vec =
          lut->lwe_trivial_indexes_vec;

-      lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
-          active_streams);
-
+      /// Make sure all data that should be on GPU 0 is indeed there
+      cuda_event_record(lut->event_scatter_in, streams.stream(0),
+                        streams.gpu_index(0));
+      for (int j = 1; j < active_streams.count(); j++) {
+        cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
+                               streams.gpu_index(j));
+      }
      /// With multiple GPUs we push to the vectors on each GPU then when we
      /// gather data to GPU 0 we can copy back to the original indexing
      multi_gpu_scatter_lwe_async<Torus>(
-          active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in.get(),
+          active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in,
          lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
          lut->active_streams.count(), num_blocks_to_decompress,
          compression_params.small_lwe_dimension + 1);
@@ -374,7 +378,7 @@ host_integer_decompress(CudaStreams streams,
      execute_pbs_async<Torus, Torus>(
          active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
          lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
-          lwe_trivial_indexes_vec, d_bsks, lut->buffer,
+          lwe_trivial_indexes_vec, d_bsks, nullptr, lut->buffer,
          encryption_params.glwe_dimension,
          compression_params.small_lwe_dimension,
          encryption_params.polynomial_size, encryption_params.pbs_base_log,
@@ -391,8 +395,15 @@ host_integer_decompress(CudaStreams streams,

      /// Synchronize all GPUs
      // other gpus record their events
-      lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
-          active_streams);
+      for (int j = 1; j < active_streams.count(); j++) {
+        cuda_event_record(lut->event_scatter_out[j], active_streams.stream(j),
+                          active_streams.gpu_index(j));
+      }
+      // GPU 0 waits for all
+      for (int j = 1; j < active_streams.count(); j++) {
+        cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
+                               streams.gpu_index(0));
+      }
    }
  } else {
    static_assert(std::is_same_v<Torus, __uint128_t>,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cu
@@ -24,13 +24,14 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
    CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
    CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
  PUSH_RANGE("div")
  auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

-  host_integer_div_rem_kb<uint64_t>(CudaStreams(streams), quotient, remainder,
-                                    numerator, divisor, is_signed, bsks,
-                                    (uint64_t **)(ksks), mem);
+  host_integer_div_rem_kb<uint64_t>(
+      CudaStreams(streams), quotient, remainder, numerator, divisor, is_signed,
+      bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem);
  POP_RANGE()
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -4,7 +4,6 @@
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer/abs.cuh"
-#include "integer/cast.cuh"
 #include "integer/comparison.cuh"
 #include "integer/integer.cuh"
 #include "integer/integer_utilities.h"
@@ -32,455 +31,14 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
  return size_tracker;
 }

-template <typename Torus>
-__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
-    CudaStreams streams, CudaRadixCiphertextFFI *quotient,
-    CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
-    CudaRadixCiphertextFFI const *divisor, void *const *bsks,
-    uint64_t *const *ksks, unsigned_int_div_rem_2_2_memory<uint64_t> *mem_ptr) {
-
-  if (streams.count() < 4) {
-    PANIC("GPU count should be greater than 4 when using div_rem_2_2");
-  }
-  if (mem_ptr->params.message_modulus != 4 ||
-      mem_ptr->params.carry_modulus != 4) {
-    PANIC("Only message_modulus == 4 && carry_modulus == 4 parameters are "
-          "supported");
-  }
-
-  // alias
-  auto radix_params = mem_ptr->params;
-  auto num_blocks = quotient->num_radix_blocks;
-  auto remainder_gpu_0 = remainder;
-  auto remainder_gpu_1 = mem_ptr->remainder_gpu_1;
-  auto remainder_gpu_2 = mem_ptr->remainder_gpu_2;
-  auto remainder_gpu_3 = mem_ptr->remainder_gpu_3;
-  auto divisor_gpu_0 = divisor;
-  auto divisor_gpu_1 = mem_ptr->divisor_gpu_1;
-  auto divisor_gpu_2 = mem_ptr->divisor_gpu_2;
-
-  // gpu[0] -> gpu[0]
-  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                     remainder_gpu_0, numerator);
-
-  // gpu[0] -> gpu[1]
-  copy_radix_ciphertext_async<Torus>(streams.stream(1), streams.gpu_index(1),
-                                     remainder_gpu_1, numerator);
-  // gpu[0] -> gpu[1]
-  copy_radix_ciphertext_async<Torus>(streams.stream(1), streams.gpu_index(1),
-                                     divisor_gpu_1, divisor);
-  // gpu[0] -> gpu[2]
-  copy_radix_ciphertext_async<Torus>(streams.stream(2), streams.gpu_index(2),
-                                     remainder_gpu_2, numerator);
-  // gpu[0] -> gpu[3]
-  copy_radix_ciphertext_async<Torus>(streams.stream(3), streams.gpu_index(3),
-                                     remainder_gpu_3, numerator);
-  // gpu[0] -> gpu[2]
-  copy_radix_ciphertext_async<Torus>(streams.stream(2), streams.gpu_index(2),
-                                     divisor_gpu_2, divisor);
-
-  // gpu[0]
-  set_zero_radix_ciphertext_slice_async<Torus>(
-      streams.stream(0), streams.gpu_index(0), quotient, 0, num_blocks);
-  quotient->num_radix_blocks = 0;
-
-  // Copy divisor_gpu_2 into d1 gpu[2] -> gpu[2]
-  mem_ptr->d1->num_radix_blocks = divisor_gpu_2->num_radix_blocks;
-  copy_radix_ciphertext_async<Torus>(streams.stream(2), streams.gpu_index(2),
-                                     mem_ptr->d1, divisor_gpu_2);
-
-  // Computes 2*d by extending and shifting on gpu[1]
-  host_extend_radix_with_trivial_zero_blocks_msb<Torus>(
-      mem_ptr->d2, divisor_gpu_1, streams.get_ith(1));
-  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-      streams.get_ith(1), mem_ptr->d2, 1, mem_ptr->shift_mem, &bsks[1],
-      &ksks[1], mem_ptr->d2->num_radix_blocks);
-
-  // Computes 3*d = 4*d - d using block shift and subtraction on gpu[0]
-  host_extend_radix_with_trivial_zero_blocks_msb<Torus>(
-      mem_ptr->tmp_gpu_0, divisor_gpu_0, streams.get_ith(0));
-  host_radix_blocks_rotate_right<Torus>(streams.get_ith(0), mem_ptr->d3,
-                                        mem_ptr->tmp_gpu_0, 1,
-                                        mem_ptr->tmp_gpu_0->num_radix_blocks);
-  set_zero_radix_ciphertext_slice_async<Torus>(
-      streams.stream(0), streams.gpu_index(0), mem_ptr->d3, 0, 1);
-  host_sub_and_propagate_single_carry(streams.get_ith(0), mem_ptr->d3,
-                                      mem_ptr->tmp_gpu_0, nullptr, nullptr,
-                                      mem_ptr->sub_and_propagate_mem, &bsks[0],
-                                      &ksks[0], outputFlag::FLAG_NONE, 0);
-
-  // +-----------------+-----------------+-----------------+-----------------+
-  // |     GPU[0]      |     GPU[1]      |     GPU[2]      |     GPU[3]      |
-  // +-----------------+-----------------+-----------------+-----------------+
-  // | d3              | d2              | d1              | -               |
-  // | low3            | low2            | low1            | -               |
-  // | rem3            | rem2            | rem1            | rem0            |
-  // | sub_result_1    | sub_result_2    | sub_result_3    | -               |
-  // | s_1_overflowed  | s_2_overflowed  | s_3_overflowed  | -               |
-  // | cmp_1           | cmp_2           | cmp_3           | -               |
-  // | r3              | r2              | r1              | -               |
-  // | o3              | o2              | o1              | -               |
-  // | c3 = !o3        | c2 = !o2 + o3   | c1 = !o1 + o2   | c0 = o1         |
-  // | z_o_not_1_lut_1 | z_o_not_2_lut_1 | z_o_not_2_lut_2 | z_o_not_1_lut_2 |
-  // +-----------------+-----------------+-----------------+-----------------+
-  for (int block_index = num_blocks - 1; block_index >= 0; block_index--) {
-
-    uint32_t slice_len = num_blocks - block_index;
-
-    auto init_low_rem_f =
-        [&](CudaRadixCiphertextFFI *low, CudaRadixCiphertextFFI *xd,
-            CudaRadixCiphertextFFI *rem, CudaRadixCiphertextFFI *cur_remainder,
-            size_t gpu_index, bool init_low) {
-          rem->num_radix_blocks = slice_len;
-          if (init_low) {
-            low->num_radix_blocks = slice_len;
-            copy_radix_ciphertext_slice_async<Torus>(
-                streams.stream(gpu_index), streams.gpu_index(gpu_index), low, 0,
-                slice_len, xd, 0, slice_len);
-          }
-          copy_radix_ciphertext_slice_async<Torus>(
-              streams.stream(gpu_index), streams.gpu_index(gpu_index), rem, 0,
-              slice_len, cur_remainder, block_index, num_blocks);
-        };
-
-    init_low_rem_f(nullptr, nullptr, mem_ptr->rem0, remainder_gpu_3, 3, false);
-    init_low_rem_f(mem_ptr->low1, mem_ptr->d1, mem_ptr->rem1, remainder_gpu_2,
-                   2, true);
-    init_low_rem_f(mem_ptr->low2, mem_ptr->d2, mem_ptr->rem2, remainder_gpu_1,
-                   1, true);
-    init_low_rem_f(mem_ptr->low3, mem_ptr->d3, mem_ptr->rem3, remainder_gpu_0,
-                   0, true);
-
-    auto sub_result_f = [&](CudaStreams streams, size_t gpu_index,
-                            CudaRadixCiphertextFFI *sub_result,
-                            CudaRadixCiphertextFFI *sub_overflowed,
-                            int_borrow_prop_memory<Torus> *overflow_sub_mem,
-                            CudaRadixCiphertextFFI *low,
-                            CudaRadixCiphertextFFI *rem, Torus *first_indexes,
-                            Torus *second_indexes, Torus *scalar_indexes) {
-      uint32_t compute_overflow = 1;
-      uint32_t uses_input_borrow = 0;
-      sub_result->num_radix_blocks = low->num_radix_blocks;
-      overflow_sub_mem->update_lut_indexes(
-          streams.get_ith(gpu_index), first_indexes, second_indexes,
-          scalar_indexes, rem->num_radix_blocks);
-      host_integer_overflowing_sub<uint64_t>(
-          streams.get_ith(gpu_index), sub_result, rem, low, sub_overflowed,
-          (const CudaRadixCiphertextFFI *)nullptr, overflow_sub_mem,
-          &bsks[gpu_index], &ksks[gpu_index], compute_overflow,
-          uses_input_borrow);
-    };
-
-    auto cmp_f = [&](CudaStreams streams, size_t gpu_index,
-                     CudaRadixCiphertextFFI *out_boolean_block,
-                     CudaRadixCiphertextFFI *comparison_blocks,
-                     CudaRadixCiphertextFFI *d,
-                     int_comparison_buffer<Torus> *comparison_buffer) {
-      CudaRadixCiphertextFFI *d_msb = new CudaRadixCiphertextFFI;
-      uint32_t slice_start = num_blocks - block_index;
-      uint32_t slice_end = d->num_radix_blocks;
-      as_radix_ciphertext_slice<Torus>(d_msb, d, slice_start, slice_end);
-      comparison_blocks->num_radix_blocks = d_msb->num_radix_blocks;
-      if (d_msb->num_radix_blocks == 0) {
-        cuda_memset_async(
-            (Torus *)out_boolean_block->ptr, 0,
-            sizeof(Torus) * (out_boolean_block->lwe_dimension + 1),
-            streams.stream(gpu_index), streams.gpu_index(gpu_index));
-      } else {
-        host_compare_blocks_with_zero<Torus>(
-            streams.get_ith(gpu_index), comparison_blocks, d_msb,
-            comparison_buffer, &bsks[gpu_index], &ksks[gpu_index],
-            d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
-        are_all_comparisons_block_true(
-            streams.get_ith(gpu_index), out_boolean_block, comparison_blocks,
-            comparison_buffer, &bsks[gpu_index], &ksks[gpu_index],
-            comparison_blocks->num_radix_blocks);
-
-        host_negation<Torus>(
-            streams.stream(gpu_index), streams.gpu_index(gpu_index),
-            (Torus *)out_boolean_block->ptr, (Torus *)out_boolean_block->ptr,
-            radix_params.big_lwe_dimension, 1);
-
-        // we calculate encoding because this block works only for
-        // message_modulus = 4 and carry_modulus = 4.
-        const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
-        host_addition_plaintext_scalar<Torus>(
-            streams.stream(gpu_index), streams.gpu_index(gpu_index),
-            (Torus *)out_boolean_block->ptr, (Torus *)out_boolean_block->ptr,
-            encoded_scalar, radix_params.big_lwe_dimension, 1);
-      }
-      delete d_msb;
-    };
-
-    for (uint j = 0; j < 3; j++) {
-      cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
-    }
-
-    size_t indexes_id = mem_ptr->rem3->num_radix_blocks - 1;
-    sub_result_f(streams, 0, mem_ptr->sub_result_1, mem_ptr->sub_1_overflowed,
-                 mem_ptr->overflow_sub_mem_1, mem_ptr->low3, mem_ptr->rem3,
-                 mem_ptr->first_indexes_for_overflow_sub_gpu_0[indexes_id],
-                 mem_ptr->second_indexes_for_overflow_sub_gpu_0[indexes_id],
-                 mem_ptr->scalars_for_overflow_sub_gpu_0[indexes_id]);
-    sub_result_f(streams, 1, mem_ptr->sub_result_2, mem_ptr->sub_2_overflowed,
-                 mem_ptr->overflow_sub_mem_2, mem_ptr->low2, mem_ptr->rem2,
-                 mem_ptr->first_indexes_for_overflow_sub_gpu_1[indexes_id],
-                 mem_ptr->second_indexes_for_overflow_sub_gpu_1[indexes_id],
-                 mem_ptr->scalars_for_overflow_sub_gpu_1[indexes_id]);
-    sub_result_f(streams, 2, mem_ptr->sub_result_3, mem_ptr->sub_3_overflowed,
-                 mem_ptr->overflow_sub_mem_3, mem_ptr->low1, mem_ptr->rem1,
-                 mem_ptr->first_indexes_for_overflow_sub_gpu_2[indexes_id],
-                 mem_ptr->second_indexes_for_overflow_sub_gpu_2[indexes_id],
-                 mem_ptr->scalars_for_overflow_sub_gpu_2[indexes_id]);
-
-    cmp_f(mem_ptr->sub_streams_1, 0, mem_ptr->cmp_1,
-          mem_ptr->comparison_blocks_1, mem_ptr->d3,
-          mem_ptr->comparison_buffer_1);
-    cmp_f(mem_ptr->sub_streams_1, 1, mem_ptr->cmp_2,
-          mem_ptr->comparison_blocks_2, mem_ptr->d2,
-          mem_ptr->comparison_buffer_2);
-    cmp_f(mem_ptr->sub_streams_1, 2, mem_ptr->cmp_3,
-          mem_ptr->comparison_blocks_3, mem_ptr->d1,
-          mem_ptr->comparison_buffer_3);
-
-    for (uint j = 0; j < 3; j++) {
-      cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
-      cuda_synchronize_stream(mem_ptr->sub_streams_1.stream(j),
-                              mem_ptr->sub_streams_1.gpu_index(j));
-    }
-
-    auto r1 = mem_ptr->sub_result_3;
-    auto r2 = mem_ptr->sub_result_2;
-    auto r3 = mem_ptr->sub_result_1;
-    auto o1 = mem_ptr->sub_3_overflowed;
-    auto o2 = mem_ptr->sub_2_overflowed;
-    auto o3 = mem_ptr->sub_1_overflowed;
-
-    // used as a bitor
-    host_integer_radix_bitop_kb(streams.get_ith(0), o3, o3, mem_ptr->cmp_1,
-                                mem_ptr->bitor_mem_1, &bsks[0], &ksks[0]);
-    // used as a bitor
-    host_integer_radix_bitop_kb(streams.get_ith(1), o2, o2, mem_ptr->cmp_2,
-                                mem_ptr->bitor_mem_2, &bsks[1], &ksks[1]);
-    // used as a bitor
-    host_integer_radix_bitop_kb(streams.get_ith(2), o1, o1, mem_ptr->cmp_3,
-                                mem_ptr->bitor_mem_3, &bsks[2], &ksks[2]);
-
-    // cmp_1, cmp_2, cmp_3 are not needed anymore, we can reuse them as c3,
-    // c2, c1. c0 is allocated on gpu[3], we take it from mem_ptr.
-    auto c3 = mem_ptr->cmp_1;
-    auto c2 = mem_ptr->cmp_2;
-    auto c1 = mem_ptr->cmp_3;
-    auto c0 = mem_ptr->c0;
-
-    // move all `o` so that each gpu has required `o` for calculating `c`
-    auto o3_gpu_1 = mem_ptr->tmp_gpu_1;
-    auto o2_gpu_2 = mem_ptr->tmp_gpu_2;
-    auto o1_gpu_3 = mem_ptr->tmp_gpu_3;
-
-    o3_gpu_1->num_radix_blocks = o3->num_radix_blocks;
-    o2_gpu_2->num_radix_blocks = o2->num_radix_blocks;
-    o1_gpu_3->num_radix_blocks = o1->num_radix_blocks;
-
-    for (uint j = 0; j < 4; j++) {
-      cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
-    }
-
-    copy_radix_ciphertext_async<Torus>(streams.stream(1), streams.gpu_index(1),
-                                       o3_gpu_1, o3);
-    copy_radix_ciphertext_async<Torus>(streams.stream(2), streams.gpu_index(2),
-                                       o2_gpu_2, o2);
-    copy_radix_ciphertext_async<Torus>(streams.stream(3), streams.gpu_index(3),
-                                       o1_gpu_3, o1);
-
-    // c3 = !o3
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), c3, 0, 1, o3, 0, 1);
-    host_negation<Torus>(streams.stream(0), streams.gpu_index(0),
-                         (Torus *)c3->ptr, (Torus *)c3->ptr,
-                         radix_params.big_lwe_dimension, 1);
-    const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
-    host_addition_plaintext_scalar<Torus>(
-        streams.stream(0), streams.gpu_index(0), (Torus *)c3->ptr,
-        (Torus *)c3->ptr, encoded_scalar, radix_params.big_lwe_dimension, 1);
-
-    // c2 = !o2 + o3
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams.stream(1), streams.gpu_index(1), c2, 0, 1, o2, 0, 1);
-    host_negation<Torus>(streams.stream(1), streams.gpu_index(1),
-                         (Torus *)c2->ptr, (Torus *)c2->ptr,
-                         radix_params.big_lwe_dimension, 1);
-    host_addition_plaintext_scalar<Torus>(
-        streams.stream(1), streams.gpu_index(1), (Torus *)c2->ptr,
-        (Torus *)c2->ptr, encoded_scalar, radix_params.big_lwe_dimension, 1);
-    host_addition<Torus>(streams.stream(1), streams.gpu_index(1), c2, c2,
-                         o3_gpu_1, 1, 4, 4);
-
-    // c1 = !o1 + o2
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams.stream(2), streams.gpu_index(2), c1, 0, 1, o1, 0, 1);
-    host_negation<Torus>(streams.stream(2), streams.gpu_index(2),
-                         (Torus *)c1->ptr, (Torus *)c1->ptr,
-                         radix_params.big_lwe_dimension, 1);
-    host_addition_plaintext_scalar<Torus>(
-        streams.stream(2), streams.gpu_index(2), (Torus *)c1->ptr,
-        (Torus *)c1->ptr, encoded_scalar, radix_params.big_lwe_dimension, 1);
-    host_addition<Torus>(streams.stream(2), streams.gpu_index(2), c1, c1,
-                         o2_gpu_2, 1, 4, 4);
-
-    // c0 = o1 (direct copy)
-    copy_radix_ciphertext_slice_async<Torus>(streams.stream(3),
-                                             streams.gpu_index(3), mem_ptr->c0,
-                                             0, 1, o1_gpu_3, 0, 1);
-
-    auto conditional_update = [&](CudaStreams streams, size_t gpu_index,
-                                  CudaRadixCiphertextFFI *cx,
-                                  CudaRadixCiphertextFFI *rx,
-                                  int_radix_lut<Torus> *lut, Torus factor) {
-      auto rx_list = to_lwe_ciphertext_list(rx);
-      host_cleartext_multiplication<Torus>(streams.stream(gpu_index),
-                                           streams.gpu_index(gpu_index),
-                                           (Torus *)rx->ptr, &rx_list, factor);
-      host_add_the_same_block_to_all_blocks<Torus>(streams.stream(gpu_index),
-                                                   streams.gpu_index(gpu_index),
-                                                   rx, rx, cx, 4, 4);
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams.get_ith(gpu_index), rx, rx, &bsks[gpu_index],
-          &ksks[gpu_index], lut, rx->num_radix_blocks);
-    };
-
-    for (uint j = 0; j < 4; j++) {
-      cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
-      cuda_synchronize_stream(mem_ptr->sub_streams_1.stream(j),
-                              mem_ptr->sub_streams_1.gpu_index(j));
-    }
-
-    conditional_update(streams, 0, c3, r3, mem_ptr->zero_out_if_not_1_lut_1, 2);
-    conditional_update(streams, 1, c2, r2, mem_ptr->zero_out_if_not_2_lut_1, 3);
-    conditional_update(streams, 2, c1, r1, mem_ptr->zero_out_if_not_2_lut_2, 3);
-    conditional_update(streams, 3, c0, mem_ptr->rem0,
-                       mem_ptr->zero_out_if_not_1_lut_2, 2);
-
-    // calculate quotient bits GPU[2]
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        mem_ptr->sub_streams_1.get_ith(2), mem_ptr->q1, c1, &bsks[2], &ksks[2],
-        mem_ptr->quotient_lut_1, 1);
-    // calculate quotient bits GPU[1]
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        mem_ptr->sub_streams_1.get_ith(1), mem_ptr->q2, c2, &bsks[1], &ksks[1],
-        mem_ptr->quotient_lut_2, 1);
-    // calculate quotient bits GPU[0]
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        mem_ptr->sub_streams_1.get_ith(0), mem_ptr->q3, c3, &bsks[0], &ksks[0],
-        mem_ptr->quotient_lut_3, 1);
-
-    for (uint j = 0; j < 4; j++) {
-      cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
-      cuda_synchronize_stream(mem_ptr->sub_streams_1.stream(j),
-                              mem_ptr->sub_streams_1.gpu_index(j));
-    }
-
-    // We need to accumulate rem, r1, r2, and r3, but each buffer currently
-    // lives on a different GPU. To gather them on GPU[0], we’ll **reuse**
-    // buffers already allocated on GPU[0]. At this point, the contents of rem3,
-    // tmp_gpu_0, and low3 are no longer needed, so it’s safe to repurpose them.
-    // Aliases for the GPU[0] destinations:
-    auto r3_gpu_0 = r3;                 // reuse: destination for r3 on GPU[0]
-    auto r2_gpu_0 = mem_ptr->tmp_gpu_0; // reuse: destination for r2 on GPU[0]
-    auto r1_gpu_0 = mem_ptr->low3;      // reuse: destination for r1 on GPU[0]
-    auto rem_gpu_0 = mem_ptr->rem3;     // reuse: destination for rem on GPU[0]
-
-    r2_gpu_0->num_radix_blocks = r2->num_radix_blocks;
-    // r3 is already on GPU 0, so no need to copy it.
-
-    // Copy r2 from GPU[1] to GPU[0]
-    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                       r2_gpu_0, r2);
-
-    // Copy r1 from GPU[2] to GPU[0]
-    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                       r1_gpu_0, r1);
-
-    // Copy rem from GPU[3] to GPU[0]
-    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                       rem_gpu_0, mem_ptr->rem0);
-
-    // We do the same to accumulate quotient bits q1, q2 and q3. q3 is already
-    // on GPU[0]. To copy q1 and q2 we will reuse buffers allocated on GPU[0]:
-    // sub_1_overflowed and cmp_1.
-    auto q3_gpu_0 = mem_ptr->q3; // q3 is already on GPU[0]
-    auto q2_gpu_0 =
-        mem_ptr->sub_1_overflowed;  // reuse: destination for q2 on GPU[0]
-    auto q1_gpu_0 = mem_ptr->cmp_1; // reuse: destination for q1 on GPU[0]
-    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                       q2_gpu_0, mem_ptr->q2);
-    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                       q1_gpu_0, mem_ptr->q1);
-
-    host_addition<Torus>(streams.stream(0), streams.gpu_index(0), rem_gpu_0,
-                         rem_gpu_0, r3_gpu_0, rem_gpu_0->num_radix_blocks, 4,
-                         4);
-    host_addition<Torus>(streams.stream(0), streams.gpu_index(0), rem_gpu_0,
-                         rem_gpu_0, r2_gpu_0, rem_gpu_0->num_radix_blocks, 4,
-                         4);
-    host_addition<Torus>(streams.stream(0), streams.gpu_index(0), rem_gpu_0,
-                         rem_gpu_0, r1_gpu_0, rem_gpu_0->num_radix_blocks, 4,
-                         4);
-
-    host_addition<Torus>(streams.stream(0), streams.gpu_index(0), q3_gpu_0,
-                         q3_gpu_0, q2_gpu_0, 1, 4, 4);
-    host_addition<Torus>(streams.stream(0), streams.gpu_index(0), q3_gpu_0,
-                         q3_gpu_0, q1_gpu_0, 1, 4, 4);
-
-    streams.synchronize();
-
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, rem_gpu_0, rem_gpu_0, bsks, ksks,
-        mem_ptr->message_extract_lut_1, rem_gpu_0->num_radix_blocks);
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        mem_ptr->sub_streams_1, q3_gpu_0, q3_gpu_0, bsks, ksks,
-        mem_ptr->message_extract_lut_2, 1);
-    streams.synchronize();
-    mem_ptr->sub_streams_1.synchronize();
-
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), remainder_gpu_0, block_index,
-        remainder_gpu_0->num_radix_blocks, rem_gpu_0, 0,
-        rem_gpu_0->num_radix_blocks);
-    insert_block_in_radix_ciphertext_async<Torus>(
-        streams.stream(0), streams.gpu_index(0), q3_gpu_0, quotient, 0);
-
-    // Copy remainder_gpu_0 to all other GPUs
-    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                       remainder_gpu_1, remainder_gpu_0);
-    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                       remainder_gpu_2, remainder_gpu_0);
-    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
-                                       remainder_gpu_3, remainder_gpu_0);
-
-    // non boolean blocks
-    for (int block_id = 0; block_id < slice_len; block_id++) {
-      mem_ptr->sub_result_1->degrees[block_id] =
-          radix_params.message_modulus - 1;
-      mem_ptr->rem0->degrees[block_id] = radix_params.message_modulus - 1;
-    }
-
-    // boolean blocks
-    mem_ptr->cmp_3->degrees[0] = 0;
-    mem_ptr->cmp_2->degrees[0] = 0;
-    mem_ptr->cmp_1->degrees[0] = 0;
-    mem_ptr->cmp_3->noise_levels[0] = 0;
-
-    streams.synchronize();
-  }
-}
-
 template <typename Torus>
 __host__ void host_unsigned_integer_div_rem_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *quotient,
    CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
    CudaRadixCiphertextFFI const *divisor, void *const *bsks,
-    uint64_t *const *ksks, unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {
+    uint64_t *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {

  if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
      remainder->num_radix_blocks != divisor->num_radix_blocks ||
@@ -490,14 +48,6 @@ __host__ void host_unsigned_integer_div_rem_kb(
      remainder->lwe_dimension != divisor->lwe_dimension ||
      remainder->lwe_dimension != quotient->lwe_dimension)
    PANIC("Cuda error: input and output lwe dimension must be equal")
-
-  if (mem_ptr->params.message_modulus == 4 &&
-      mem_ptr->params.carry_modulus == 4 && streams.count() >= 4) {
-    host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
-        streams, quotient, remainder, numerator, divisor, bsks, ksks,
-        mem_ptr->div_rem_2_2_mem);
-    return;
-  }
  auto radix_params = mem_ptr->params;
  auto num_blocks = quotient->num_radix_blocks;

@@ -596,7 +146,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
          interesting_divisor->num_radix_blocks);
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, &last_interesting_divisor_block,
-          &last_interesting_divisor_block, bsks, ksks,
+          &last_interesting_divisor_block, bsks, ksks, ms_noise_reduction_key,
          mem_ptr->masking_luts_1[shifted_mask], 1);
    }; // trim_last_interesting_divisor_bits

@@ -623,7 +173,7 @@ __host__ void host_unsigned_integer_div_rem_kb(

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, divisor_ms_blocks, divisor_ms_blocks, bsks, ksks,
-          mem_ptr->masking_luts_2[shifted_mask], 1);
+          ms_noise_reduction_key, mem_ptr->masking_luts_2[shifted_mask], 1);
    }; // trim_first_divisor_ms_bits

    // This does
@@ -645,7 +195,7 @@ __host__ void host_unsigned_integer_div_rem_kb(

      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
          streams, interesting_remainder1, 1, mem_ptr->shift_mem_1, bsks, ksks,
-          interesting_remainder1->num_radix_blocks);
+          ms_noise_reduction_key, interesting_remainder1->num_radix_blocks);

      reset_radix_ciphertext_blocks(mem_ptr->tmp_radix,
                                    interesting_remainder1->num_radix_blocks);
@@ -674,7 +224,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto left_shift_interesting_remainder2 = [&](CudaStreams streams) {
      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
          streams, interesting_remainder2, 1, mem_ptr->shift_mem_2, bsks, ksks,
-          interesting_remainder2->num_radix_blocks);
+          ms_noise_reduction_key, interesting_remainder2->num_radix_blocks);
    }; // left_shift_interesting_remainder2

    streams.synchronize();
@@ -747,7 +297,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
          streams, new_remainder, merged_interesting_remainder,
          interesting_divisor, subtraction_overflowed,
          (const CudaRadixCiphertextFFI *)nullptr, mem_ptr->overflow_sub_mem,
-          bsks, ksks, compute_borrow, uses_input_borrow);
+          bsks, ksks, ms_noise_reduction_key, compute_borrow,
+          uses_input_borrow);
    };

    // fills:
@@ -765,12 +316,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
        // So we can skip some stuff
        host_compare_blocks_with_zero<Torus>(
            streams, mem_ptr->tmp_1, trivial_blocks, mem_ptr->comparison_buffer,
-            bsks, ksks, trivial_blocks->num_radix_blocks,
+            bsks, ksks, ms_noise_reduction_key,
+            trivial_blocks->num_radix_blocks,
            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);

        is_at_least_one_comparisons_block_true<Torus>(
            streams, at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
-            mem_ptr->comparison_buffer, bsks, ksks,
+            mem_ptr->comparison_buffer, bsks, ksks, ms_noise_reduction_key,
            mem_ptr->tmp_1->num_radix_blocks);
      }
    };
@@ -783,7 +335,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, cleaned_merged_interesting_remainder,
          cleaned_merged_interesting_remainder, bsks, ksks,
-          mem_ptr->message_extract_lut_1,
+          ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
          cleaned_merged_interesting_remainder->num_radix_blocks);
    };

@@ -821,7 +373,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, cleaned_merged_interesting_remainder,
              cleaned_merged_interesting_remainder, overflow_sum_radix, bsks,
-              ksks, mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
+              ksks, ms_noise_reduction_key,
+              mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
              cleaned_merged_interesting_remainder->num_radix_blocks, factor);
        };

@@ -829,7 +382,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
        [&](CudaStreams streams) {
          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, new_remainder, new_remainder, overflow_sum_radix, bsks,
-              ksks, mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
+              ksks, ms_noise_reduction_key,
+              mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
              new_remainder->num_radix_blocks, factor);
        };

@@ -838,6 +392,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, mem_ptr->did_not_overflow, subtraction_overflowed,
          at_least_one_upper_block_is_non_zero, bsks, ksks,
+          ms_noise_reduction_key,
          mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
          mem_ptr->merge_overflow_flags_luts[pos_in_block]
              ->params.message_modulus);
@@ -896,10 +451,10 @@ __host__ void host_unsigned_integer_div_rem_kb(

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_1, remainder, remainder, bsks, ksks,
-      mem_ptr->message_extract_lut_1, num_blocks);
+      ms_noise_reduction_key, mem_ptr->message_extract_lut_1, num_blocks);
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_2, quotient, quotient, bsks, ksks,
-      mem_ptr->message_extract_lut_2, num_blocks);
+      ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);

  mem_ptr->sub_streams_1.synchronize();
  mem_ptr->sub_streams_2.synchronize();
@@ -910,7 +465,9 @@ __host__ void host_integer_div_rem_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *quotient,
    CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
    CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
-    uint64_t *const *ksks, int_div_rem_memory<uint64_t> *int_mem_ptr) {
+    uint64_t *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int_div_rem_memory<uint64_t> *int_mem_ptr) {
  if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
      remainder->num_radix_blocks != divisor->num_radix_blocks ||
      remainder->num_radix_blocks != quotient->num_radix_blocks)
@@ -935,16 +492,19 @@ __host__ void host_integer_div_rem_kb(
    streams.synchronize();

    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
-                               bsks, ksks, int_mem_ptr->abs_mem_1, true);
+                               bsks, ksks, ms_noise_reduction_key,
+                               int_mem_ptr->abs_mem_1, true);
    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, positive_divisor,
-                               bsks, ksks, int_mem_ptr->abs_mem_2, true);
+                               bsks, ksks, ms_noise_reduction_key,
+                               int_mem_ptr->abs_mem_2, true);

    int_mem_ptr->sub_streams_1.synchronize();
    int_mem_ptr->sub_streams_2.synchronize();

    host_unsigned_integer_div_rem_kb<Torus>(
        int_mem_ptr->sub_streams_1, quotient, remainder, positive_numerator,
-        positive_divisor, bsks, ksks, int_mem_ptr->unsigned_mem);
+        positive_divisor, bsks, ksks, ms_noise_reduction_key,
+        int_mem_ptr->unsigned_mem);

    CudaRadixCiphertextFFI numerator_sign;
    as_radix_ciphertext_slice<Torus>(&numerator_sign, numerator, num_blocks - 1,
@@ -954,7 +514,7 @@ __host__ void host_integer_div_rem_kb(
                                     num_blocks);
    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        int_mem_ptr->sub_streams_2, int_mem_ptr->sign_bits_are_different,
-        &numerator_sign, &divisor_sign, bsks, ksks,
+        &numerator_sign, &divisor_sign, bsks, ksks, ms_noise_reduction_key,
        int_mem_ptr->compare_signed_bits_lut, 1,
        int_mem_ptr->compare_signed_bits_lut->params.message_modulus);

@@ -967,36 +527,37 @@ __host__ void host_integer_div_rem_kb(

    uint32_t requested_flag = outputFlag::FLAG_NONE;
    uint32_t uses_carry = 0;
-    host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_1,
-                                       int_mem_ptr->negated_quotient, nullptr,
-                                       nullptr, int_mem_ptr->scp_mem_1, bsks,
-                                       ksks, requested_flag, uses_carry);
+    host_propagate_single_carry<Torus>(
+        int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, nullptr,
+        nullptr, int_mem_ptr->scp_mem_1, bsks, ksks, ms_noise_reduction_key,
+        requested_flag, uses_carry);

    host_integer_radix_negation<Torus>(
        int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
        radix_params.message_modulus, radix_params.carry_modulus, num_blocks);

-    host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_2,
-                                       int_mem_ptr->negated_remainder, nullptr,
-                                       nullptr, int_mem_ptr->scp_mem_2, bsks,
-                                       ksks, requested_flag, uses_carry);
+    host_propagate_single_carry<Torus>(
+        int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, nullptr,
+        nullptr, int_mem_ptr->scp_mem_2, bsks, ksks, ms_noise_reduction_key,
+        requested_flag, uses_carry);

-    host_integer_radix_cmux_kb<Torus>(
-        int_mem_ptr->sub_streams_1, quotient,
-        int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
-        quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks);
+    host_integer_radix_cmux_kb<Torus>(int_mem_ptr->sub_streams_1, quotient,
+                                      int_mem_ptr->sign_bits_are_different,
+                                      int_mem_ptr->negated_quotient, quotient,
+                                      int_mem_ptr->cmux_quotient_mem, bsks,
+                                      ksks, ms_noise_reduction_key);

    host_integer_radix_cmux_kb<Torus>(
        int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
        int_mem_ptr->negated_remainder, remainder,
-        int_mem_ptr->cmux_remainder_mem, bsks, ksks);
+        int_mem_ptr->cmux_remainder_mem, bsks, ksks, ms_noise_reduction_key);

    int_mem_ptr->sub_streams_1.synchronize();
    int_mem_ptr->sub_streams_2.synchronize();
  } else {
-    host_unsigned_integer_div_rem_kb<Torus>(streams, quotient, remainder,
-                                            numerator, divisor, bsks, ksks,
-                                            int_mem_ptr->unsigned_mem);
+    host_unsigned_integer_div_rem_kb<Torus>(
+        streams, quotient, remainder, numerator, divisor, bsks, ksks,
+        ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cu
@@ -29,12 +29,13 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
 void cuda_integer_count_of_consecutive_bits_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
    CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks) {
+    void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {

  host_integer_count_of_consecutive_bits<uint64_t>(
      CudaStreams(streams), output_ct, input_ct,
      (int_count_of_consecutive_bits_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)ksks);
+      (uint64_t **)ksks, ms_noise_reduction_key);
 }

 void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
@@ -80,12 +81,13 @@ void cuda_integer_ilog2_kb_64(
    CudaRadixCiphertextFFI const *trivial_ct_neg_n,
    CudaRadixCiphertextFFI const *trivial_ct_2,
    CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *const *bsks, void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {

  host_integer_ilog2<uint64_t>(
      CudaStreams(streams), output_ct, input_ct, trivial_ct_neg_n, trivial_ct_2,
      trivial_ct_m_minus_1_block, (int_ilog2_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)ksks);
+      (uint64_t **)ksks, ms_noise_reduction_key);
 }

 void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cuh
@@ -9,12 +9,14 @@ template <typename Torus>
 __host__ void host_integer_prepare_count_of_consecutive_bits(
    CudaStreams streams, CudaRadixCiphertextFFI *ciphertext,
    int_prepare_count_of_consecutive_bits_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks) {
+    void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto tmp = mem_ptr->tmp_ct;

  host_apply_univariate_lut_kb<Torus>(streams, tmp, ciphertext,
-                                      mem_ptr->univ_lut_mem, ksks, bsks);
+                                      mem_ptr->univ_lut_mem, ksks,
+                                      ms_noise_reduction_key, bsks);

  if (mem_ptr->direction == Leading) {
    host_radix_blocks_reverse_inplace<Torus>(streams, tmp);
@@ -22,7 +24,7 @@ __host__ void host_integer_prepare_count_of_consecutive_bits(

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      streams, ciphertext, tmp, mem_ptr->biv_lut_mem, bsks, ksks,
-      ciphertext->num_radix_blocks);
+      ms_noise_reduction_key, ciphertext->num_radix_blocks);
 }

 template <typename Torus>
@@ -46,7 +48,8 @@ __host__ void host_integer_count_of_consecutive_bits(
    CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
    CudaRadixCiphertextFFI const *input_ct,
    int_count_of_consecutive_bits_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto params = mem_ptr->params;
  auto ct_prepared = mem_ptr->ct_prepared;
@@ -57,8 +60,9 @@ __host__ void host_integer_count_of_consecutive_bits(

  // Prepare count of consecutive bits
  //
-  host_integer_prepare_count_of_consecutive_bits(
-      streams, ct_prepared, mem_ptr->prepare_mem, bsks, ksks);
+  host_integer_prepare_count_of_consecutive_bits(streams, ct_prepared,
+                                                 mem_ptr->prepare_mem, bsks,
+                                                 ksks, ms_noise_reduction_key);

  // Perform addition and propagation of prepared cts
  //
@@ -72,11 +76,12 @@ __host__ void host_integer_count_of_consecutive_bits(
  }

  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
-      streams, output_ct, cts, bsks, ksks, mem_ptr->sum_mem, counter_num_blocks,
-      ct_prepared->num_radix_blocks);
+      streams, output_ct, cts, bsks, ksks, ms_noise_reduction_key,
+      mem_ptr->sum_mem, counter_num_blocks, ct_prepared->num_radix_blocks);

  host_propagate_single_carry<Torus>(streams, output_ct, nullptr, nullptr,
-                                     mem_ptr->propagate_mem, bsks, ksks, 0, 0);
+                                     mem_ptr->propagate_mem, bsks, ksks,
+                                     ms_noise_reduction_key, 0, 0);
 }

 template <typename Torus>
@@ -98,14 +103,14 @@ __host__ uint64_t scratch_integer_ilog2(CudaStreams streams,
 }

 template <typename Torus>
-__host__ void
-host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
-                   CudaRadixCiphertextFFI const *input_ct,
-                   CudaRadixCiphertextFFI const *trivial_ct_neg_n,
-                   CudaRadixCiphertextFFI const *trivial_ct_2,
-                   CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block,
-                   int_ilog2_buffer<Torus> *mem_ptr, void *const *bsks,
-                   Torus *const *ksks) {
+__host__ void host_integer_ilog2(
+    CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
+    CudaRadixCiphertextFFI const *input_ct,
+    CudaRadixCiphertextFFI const *trivial_ct_neg_n,
+    CudaRadixCiphertextFFI const *trivial_ct_2,
+    CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block,
+    int_ilog2_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  // Prepare the input ciphertext by computing the number of consecutive
  // leading zeros for each of its blocks.
@@ -113,7 +118,8 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                     mem_ptr->ct_in_buffer, input_ct);
  host_integer_prepare_count_of_consecutive_bits<Torus>(
-      streams, mem_ptr->ct_in_buffer, mem_ptr->prepare_mem, bsks, ksks);
+      streams, mem_ptr->ct_in_buffer, mem_ptr->prepare_mem, bsks, ksks,
+      ms_noise_reduction_key);

  // Build the input for the sum by taking each block's leading zero count
  // and placing it into a separate, zero-padded ct slot.
@@ -142,17 +148,17 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
  //
  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
      streams, mem_ptr->sum_output_not_propagated, mem_ptr->sum_input_cts, bsks,
-      ksks, mem_ptr->sum_mem, mem_ptr->counter_num_blocks,
-      mem_ptr->input_num_blocks + 1);
+      ksks, ms_noise_reduction_key, mem_ptr->sum_mem,
+      mem_ptr->counter_num_blocks, mem_ptr->input_num_blocks + 1);

  // Apply luts to the partial sum.
  //
-  host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->message_blocks_not,
-                                      mem_ptr->sum_output_not_propagated,
-                                      mem_ptr->lut_message_not, ksks, bsks);
-  host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->carry_blocks_not,
-                                      mem_ptr->sum_output_not_propagated,
-                                      mem_ptr->lut_carry_not, ksks, bsks);
+  host_apply_univariate_lut_kb<Torus>(
+      streams, mem_ptr->message_blocks_not, mem_ptr->sum_output_not_propagated,
+      mem_ptr->lut_message_not, ksks, ms_noise_reduction_key, bsks);
+  host_apply_univariate_lut_kb<Torus>(
+      streams, mem_ptr->carry_blocks_not, mem_ptr->sum_output_not_propagated,
+      mem_ptr->lut_carry_not, ksks, ms_noise_reduction_key, bsks);

  // Left-shift the bitwise-negated carry blocks by one position.
  //
@@ -190,12 +196,12 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
      trivial_ct_2, 0, mem_ptr->counter_num_blocks);

  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
-      streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks, mem_ptr->sum_mem,
-      mem_ptr->counter_num_blocks, 3);
+      streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks,
+      ms_noise_reduction_key, mem_ptr->sum_mem, mem_ptr->counter_num_blocks, 3);

-  host_full_propagate_inplace<Torus>(streams, output_ct,
-                                     mem_ptr->final_propagate_mem, ksks, bsks,
-                                     mem_ptr->counter_num_blocks);
+  host_full_propagate_inplace<Torus>(
+      streams, output_ct, mem_ptr->final_propagate_mem, ksks,
+      ms_noise_reduction_key, bsks, mem_ptr->counter_num_blocks);
 }

 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -2,17 +2,18 @@
 #include "integer/negation.cuh"
 #include <linear_algebra.h>

-void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
-                                      CudaRadixCiphertextFFI *input_blocks,
-                                      int8_t *mem_ptr, void *const *ksks,
-                                      void *const *bsks, uint32_t num_blocks) {
+void cuda_full_propagation_64_inplace(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
+    int8_t *mem_ptr, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_blocks) {

  int_fullprop_buffer<uint64_t> *buffer =
      (int_fullprop_buffer<uint64_t> *)mem_ptr;

-  host_full_propagate_inplace<uint64_t>(CudaStreams(streams), input_blocks,
-                                        buffer, (uint64_t **)(ksks), bsks,
-                                        num_blocks);
+  host_full_propagate_inplace<uint64_t>(
+      CudaStreams(streams), input_blocks, buffer, (uint64_t **)(ksks),
+      ms_noise_reduction_key, bsks, num_blocks);
 }

 uint64_t scratch_cuda_full_propagation_64(
@@ -102,24 +103,27 @@ void cuda_propagate_single_carry_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
    CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t requested_flag, uint32_t uses_carry) {

  host_propagate_single_carry<uint64_t>(
      CudaStreams(streams), lwe_array, carry_out, carry_in,
      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
-      requested_flag, uses_carry);
+      ms_noise_reduction_key, requested_flag, uses_carry);
 }

 void cuda_add_and_propagate_single_carry_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
    const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t requested_flag, uint32_t uses_carry) {

  host_add_and_propagate_single_carry<uint64_t>(
      CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
-      requested_flag, uses_carry);
+      ms_noise_reduction_key, requested_flag, uses_carry);
 }

 void cuda_integer_overflowing_sub_kb_64_inplace(
@@ -127,13 +131,15 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
    const CudaRadixCiphertextFFI *rhs_array,
    CudaRadixCiphertextFFI *overflow_block,
    const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t compute_overflow,
-    uint32_t uses_input_borrow) {
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t compute_overflow, uint32_t uses_input_borrow) {
  PUSH_RANGE("overflow sub")
  host_integer_overflowing_sub<uint64_t>(
      CudaStreams(streams), lhs_array, lhs_array, rhs_array, overflow_block,
      input_borrow, (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)ksks, compute_overflow, uses_input_borrow);
+      (uint64_t **)ksks, ms_noise_reduction_key, compute_overflow,
+      uses_input_borrow);
  POP_RANGE()
 }

@@ -212,11 +218,14 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
 void cuda_apply_univariate_lut_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks) {

  host_apply_univariate_lut_kb<uint64_t>(
      CudaStreams(streams), output_radix_lwe, input_radix_lwe,
-      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks);
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
+      ms_noise_reduction_key, bsks);
 }

 void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
@@ -232,13 +241,14 @@ void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
 void cuda_apply_many_univariate_lut_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_many_lut,
-    uint32_t lut_stride) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {

  host_apply_many_univariate_lut_kb<uint64_t>(
      CudaStreams(streams), output_radix_lwe, input_radix_lwe,
-      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
-      num_many_lut, lut_stride);
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
+      ms_noise_reduction_key, bsks, num_many_lut, lut_stride);
 }

 uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
@@ -265,13 +275,15 @@ void cuda_apply_bivariate_lut_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe_1,
    CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
-    uint32_t shift) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {

  host_apply_bivariate_lut_kb<uint64_t>(
      CudaStreams(streams), output_radix_lwe, input_radix_lwe_1,
      input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
-      (uint64_t **)(ksks), bsks, num_radix_blocks, shift);
+      (uint64_t **)(ksks), ms_noise_reduction_key, bsks, num_radix_blocks,
+      shift);
 }

 void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
@@ -308,12 +320,14 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_radix_blocks) {

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      CudaStreams(streams), output_radix_lwe, generates_or_propagates,
      (int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
-      num_radix_blocks);
+      ms_noise_reduction_key, num_radix_blocks);
 }

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
@@ -385,12 +399,15 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
 void cuda_apply_noise_squashing_kb(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks) {

  PUSH_RANGE("apply noise squashing")
  integer_radix_apply_noise_squashing_kb<uint64_t>(
      CudaStreams(streams), output_radix_lwe, input_radix_lwe,
-      (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
+      (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
+      ms_noise_reduction_key);
  POP_RANGE()
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -507,7 +507,9 @@ template <typename Torus>
 __host__ void integer_radix_apply_univariate_lookup_table_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
-    Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
  PUSH_RANGE("apply lut")
  // apply_lookup_table
  auto params = lut->params;
@@ -545,30 +547,35 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
-        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
-        (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in.get(), ksks,
-        big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
-        num_radix_blocks);
+        streams.subset_first_gpu(), lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], (Torus *)lwe_array_in->ptr,
+        lut->lwe_indexes_in, ksks, big_lwe_dimension, small_lwe_dimension,
+        ks_base_log, ks_level, num_radix_blocks);

    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
    /// dimension to a big LWE dimension
    execute_pbs_async<Torus, Torus>(
-        streams.get_ith(0), (Torus *)lwe_array_out->ptr, lut->lwe_indexes_out,
-        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
-        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
+        streams.subset_first_gpu(), (Torus *)lwe_array_out->ptr,
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
+        ms_noise_reduction_key, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
-    lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
-        active_streams);
+    cuda_event_record(lut->event_scatter_in, streams.stream(0),
+                      streams.gpu_index(0));
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
+                             streams.gpu_index(j));
+    }

    /// With multiple GPUs we push to the vectors on each GPU then when we
    /// gather data to GPU 0 we can copy back to the original indexing
    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
-        lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes,
+        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
        big_lwe_dimension + 1);
    POP_RANGE()
@@ -583,9 +590,10 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
    execute_pbs_async<Torus, Torus>(
        active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec,
-        lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
-        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
+        lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
+        lut_stride);

    /// Copy data back to GPU 0 and release vecs
    PUSH_RANGE("gather")
@@ -594,8 +602,16 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
        lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
-    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
-        active_streams);
+    // other gpus record their events
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
+                        streams.gpu_index(j));
+    }
+    // GPU 0 waits for all
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
+                             streams.gpu_index(0));
+    }
  }
  for (uint i = 0; i < num_radix_blocks; i++) {
    auto degrees_index = lut->h_lut_indexes[i];
@@ -611,8 +627,9 @@ template <typename Torus>
 __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
-    Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_many_lut,
-    uint32_t lut_stride) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int_radix_lut<Torus> *lut, uint32_t num_many_lut, uint32_t lut_stride) {
  PUSH_RANGE("apply many lut")
  // apply_lookup_table
  auto params = lut->params;
@@ -647,30 +664,34 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
-        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
-        (Torus *)lwe_array_in->ptr, lut->lwe_indexes_in.get(), ksks,
-        big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
-        num_radix_blocks);
+        streams.subset_first_gpu(), lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], (Torus *)lwe_array_in->ptr,
+        lut->lwe_indexes_in, ksks, big_lwe_dimension, small_lwe_dimension,
+        ks_base_log, ks_level, num_radix_blocks);

    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
    /// dimension to a big LWE dimension
    execute_pbs_async<Torus, Torus>(
-        streams.get_ith(0), (Torus *)lwe_array_out->ptr, lut->lwe_indexes_out,
-        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
-        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
+        streams.subset_first_gpu(), (Torus *)lwe_array_out->ptr,
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
+        ms_noise_reduction_key, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
-    lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
-        active_streams);
-
+    cuda_event_record(lut->event_scatter_in, streams.stream(0),
+                      streams.gpu_index(0));
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
+                             streams.gpu_index(j));
+    }
    /// With multiple GPUs we push to the vectors on each GPU then when we
    /// gather data to GPU 0 we can copy back to the original indexing
    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
-        lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes,
+        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
        big_lwe_dimension + 1);
    POP_RANGE()
@@ -685,9 +706,10 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
    execute_pbs_async<Torus, Torus>(
        active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec,
-        lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
-        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
+        lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
+        lut_stride);

    /// Copy data back to GPU 0 and release vecs
    PUSH_RANGE("gather")
@@ -697,8 +719,16 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
        num_radix_blocks, big_lwe_dimension + 1, num_many_lut);
    POP_RANGE()

-    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
-        active_streams);
+    // other gpus record their events
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
+                        streams.gpu_index(j));
+    }
+    // GPU 0 waits for all
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
+                             streams.gpu_index(0));
+    }
  }
  for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
    auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
@@ -715,8 +745,9 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, void *const *bsks,
-    Torus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks,
-    uint32_t shift) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int_radix_lut<Torus> *lut, uint32_t num_radix_blocks, uint32_t shift) {
  PUSH_RANGE("apply bivar lut")
  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
      lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -748,10 +779,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
  uint32_t lut_stride = 0;

  // Left message is shifted
-  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks.get();
+  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
  host_pack_bivariate_blocks<Torus>(
      streams, lwe_array_pbs_in, lut->lwe_trivial_indexes, lwe_array_1,
-      lwe_array_2, lut->lwe_indexes_in.get(), shift, num_radix_blocks,
+      lwe_array_2, lut->lwe_indexes_in, shift, num_radix_blocks,
      params.message_modulus, params.carry_modulus);
  check_cuda_error(cudaGetLastError());

@@ -765,27 +796,31 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
-        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
-        (Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in.get(), ksks,
-        big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
-        num_radix_blocks);
+        streams.subset_first_gpu(), lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], (Torus *)lwe_array_pbs_in->ptr,
+        lut->lwe_indexes_in, ksks, big_lwe_dimension, small_lwe_dimension,
+        ks_base_log, ks_level, num_radix_blocks);

    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
    /// dimension to a big LWE dimension
    execute_pbs_async<Torus, Torus>(
-        streams.get_ith(0), (Torus *)(lwe_array_out->ptr), lut->lwe_indexes_out,
-        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
-        lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
+        streams.subset_first_gpu(), (Torus *)(lwe_array_out->ptr),
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
+        ms_noise_reduction_key, lut->buffer, glwe_dimension,
        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
  } else {
-    lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
-        active_streams);
-
+    cuda_event_record(lut->event_scatter_in, streams.stream(0),
+                      streams.gpu_index(0));
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
+                             streams.gpu_index(j));
+    }
    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
        active_streams, lwe_array_in_vec, (Torus *)lwe_array_pbs_in->ptr,
-        lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes,
+        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
        lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
        big_lwe_dimension + 1);
    POP_RANGE()
@@ -800,9 +835,10 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
    execute_pbs_async<Torus, Torus>(
        active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
        lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec,
-        lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
-        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
+        lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
+        lut_stride);

    /// Copy data back to GPU 0 and release vecs
    PUSH_RANGE("gather")
@@ -811,8 +847,16 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
        lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
    POP_RANGE()
-    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
-        active_streams);
+    // other gpus record their events
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
+                        streams.gpu_index(j));
+    }
+    // GPU 0 waits for all
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
+                             streams.gpu_index(0));
+    }
  }
  for (uint i = 0; i < num_radix_blocks; i++) {
    auto degrees_index = lut->h_lut_indexes[i];
@@ -1273,7 +1317,9 @@ template <typename Torus>
 void host_compute_shifted_blocks_and_states(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
    int_shifted_blocks_and_states_memory<Torus> *mem, void *const *bsks,
-    Torus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t lut_stride, uint32_t num_many_lut) {

  auto num_radix_blocks = lwe_array->num_radix_blocks;

@@ -1282,7 +1328,7 @@ void host_compute_shifted_blocks_and_states(

  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
      streams, shifted_blocks_and_states, lwe_array, bsks, ksks,
-      luts_array_first_step, num_many_lut, lut_stride);
+      ms_noise_reduction_key, luts_array_first_step, num_many_lut, lut_stride);

  auto shifted_blocks = mem->shifted_blocks;
  auto block_states = mem->block_states;
@@ -1301,7 +1347,9 @@ void host_resolve_group_carries_sequentially(
    CudaStreams streams, CudaRadixCiphertextFFI *resolved_carries,
    CudaRadixCiphertextFFI *grouping_pgns, int_radix_params params,
    int_seq_group_prop_memory<Torus> *mem, void *const *bsks,
-    Torus *const *ksks, uint32_t num_groups) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_groups) {

  auto group_resolved_carries = mem->group_resolved_carries;
  if (num_groups > 1) {
@@ -1350,8 +1398,8 @@ void host_resolve_group_carries_sequentially(
                                       blocks_to_solve + 1);
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, &shifted_group_resolved_carries,
-          &shifted_group_resolved_carries, bsks, ksks, luts_sequential,
-          blocks_to_solve);
+          &shifted_group_resolved_carries, bsks, ksks, ms_noise_reduction_key,
+          luts_sequential, blocks_to_solve);

      // Copy the result to the resolved carries array
      copy_radix_ciphertext_slice_async<Torus>(
@@ -1368,7 +1416,9 @@ template <typename Torus>
 void host_compute_prefix_sum_hillis_steele(
    CudaStreams streams, CudaRadixCiphertextFFI *step_output,
    CudaRadixCiphertextFFI *generates_or_propagates, int_radix_lut<Torus> *luts,
-    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {
+    void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  if (step_output->lwe_dimension != generates_or_propagates->lwe_dimension)
    PANIC("Cuda error: input lwe dimensions must be the same")
@@ -1390,8 +1440,9 @@ void host_compute_prefix_sum_hillis_steele(
    int cur_total_blocks = num_radix_blocks - space;

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        streams, &cur_blocks, &cur_blocks, prev_blocks, bsks, ksks, luts,
-        cur_total_blocks, luts->params.message_modulus);
+        streams, &cur_blocks, &cur_blocks, prev_blocks, bsks, ksks,
+        ms_noise_reduction_key, luts, cur_total_blocks,
+        luts->params.message_modulus);

    copy_radix_ciphertext_slice_async<Torus>(
        streams.stream(0), streams.gpu_index(0), generates_or_propagates, space,
@@ -1411,8 +1462,9 @@ template <typename Torus>
 void host_compute_propagation_simulators_and_group_carries(
    CudaStreams streams, CudaRadixCiphertextFFI *block_states,
    int_radix_params params, int_prop_simu_group_carries_memory<Torus> *mem,
-    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
-    uint32_t num_groups) {
+    void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks, uint32_t num_groups) {

  if (num_radix_blocks > block_states->num_radix_blocks)
    PANIC("Cuda error: input does not have enough radix blocks")
@@ -1429,7 +1481,7 @@ void host_compute_propagation_simulators_and_group_carries(
  auto luts_array_second_step = mem->luts_array_second_step;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, propagation_cum_sums, propagation_cum_sums, bsks, ksks,
-      luts_array_second_step, num_radix_blocks);
+      ms_noise_reduction_key, luts_array_second_step, num_radix_blocks);

  host_integer_radix_scalar_addition_inplace<Torus>(
      streams, propagation_cum_sums, mem->scalar_array_cum_sum,
@@ -1448,9 +1500,10 @@ void host_compute_propagation_simulators_and_group_carries(
  auto resolved_carries = mem->resolved_carries;
  if (mem->use_sequential_algorithm_to_resolve_group_carries) {
    // Resolve group carries sequentially
-    host_resolve_group_carries_sequentially(
-        streams, resolved_carries, grouping_pgns, params,
-        mem->seq_group_prop_mem, bsks, ksks, num_groups);
+    host_resolve_group_carries_sequentially(streams, resolved_carries,
+                                            grouping_pgns, params,
+                                            mem->seq_group_prop_mem, bsks, ksks,
+                                            ms_noise_reduction_key, num_groups);
  } else {
    // Resolve group carries with hillis steele
    auto luts_carry_propagation_sum = mem->hs_group_prop_mem->lut_hillis_steele;
@@ -1459,7 +1512,8 @@ void host_compute_propagation_simulators_and_group_carries(
                                     resolved_carries, 1, num_groups);
    host_compute_prefix_sum_hillis_steele<Torus>(
        streams, &shifted_resolved_carries, grouping_pgns,
-        luts_carry_propagation_sum, bsks, ksks, num_groups - 1);
+        luts_carry_propagation_sum, bsks, ksks, ms_noise_reduction_key,
+        num_groups - 1);
  }
 }

@@ -1473,7 +1527,9 @@ template <typename Torus>
 void host_compute_shifted_blocks_and_borrow_states(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
    int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
-    Torus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t lut_stride, uint32_t num_many_lut) {
  auto num_radix_blocks = lwe_array->num_radix_blocks;

  auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
@@ -1481,7 +1537,7 @@ void host_compute_shifted_blocks_and_borrow_states(

  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
      streams, shifted_blocks_and_borrow_states, lwe_array, bsks, ksks,
-      luts_array_first_step, num_many_lut, lut_stride);
+      ms_noise_reduction_key, luts_array_first_step, num_many_lut, lut_stride);

  auto shifted_blocks = mem->shifted_blocks;
  auto borrow_states = mem->borrow_states;
@@ -1503,11 +1559,11 @@ void host_compute_shifted_blocks_and_borrow_states(
 * have size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
 */
 template <typename Torus>
-void host_full_propagate_inplace(CudaStreams streams,
-                                 CudaRadixCiphertextFFI *input_blocks,
-                                 int_fullprop_buffer<Torus> *mem_ptr,
-                                 Torus *const *ksks, void *const *bsks,
-                                 uint32_t num_blocks) {
+void host_full_propagate_inplace(
+    CudaStreams streams, CudaRadixCiphertextFFI *input_blocks,
+    int_fullprop_buffer<Torus> *mem_ptr, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_blocks) {
  auto params = mem_ptr->lut->params;

  // In the case of extracting a single LWE this parameters are dummy
@@ -1519,7 +1575,8 @@ void host_full_propagate_inplace(CudaStreams streams,

    /// Since the keyswitch is done on one input only, use only 1 GPU
    execute_keyswitch_async<Torus>(
-        streams.get_ith(0), (Torus *)(mem_ptr->tmp_small_lwe_vector->ptr),
+        streams.subset_first_gpu(),
+        (Torus *)(mem_ptr->tmp_small_lwe_vector->ptr),
        mem_ptr->lut->lwe_trivial_indexes, (Torus *)cur_input_block.ptr,
        mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
        params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);
@@ -1529,12 +1586,12 @@ void host_full_propagate_inplace(CudaStreams streams,
        1, 2, mem_ptr->tmp_small_lwe_vector, 0, 1);

    execute_pbs_async<Torus, Torus>(
-        streams.get_ith(0), (Torus *)mem_ptr->tmp_big_lwe_vector->ptr,
+        streams.subset_first_gpu(), (Torus *)mem_ptr->tmp_big_lwe_vector->ptr,
        mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
        mem_ptr->lut->lut_indexes_vec,
        (Torus *)mem_ptr->tmp_small_lwe_vector->ptr,
-        mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
-        params.glwe_dimension, params.small_lwe_dimension,
+        mem_ptr->lut->lwe_trivial_indexes, bsks, ms_noise_reduction_key,
+        mem_ptr->lut->buffer, params.glwe_dimension, params.small_lwe_dimension,
        params.polynomial_size, params.pbs_base_log, params.pbs_level,
        params.grouping_factor, 2, params.pbs_type, num_many_lut, lut_stride);

@@ -1665,12 +1722,13 @@ __host__ void scalar_pack_blocks(cudaStream_t stream, uint32_t gpu_index,
 * * (lwe_dimension+1) * sizeeof(Torus) bytes
 */
 template <typename Torus>
-__host__ void
-extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
-               const CudaRadixCiphertextFFI *lwe_array_in, void *const *bsks,
-               Torus *const *ksks, uint32_t effective_num_radix_blocks,
-               uint32_t num_radix_blocks,
-               int_bit_extract_luts_buffer<Torus> *bit_extract) {
+__host__ void extract_n_bits(
+    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
+    const CudaRadixCiphertextFFI *lwe_array_in, void *const *bsks,
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t effective_num_radix_blocks, uint32_t num_radix_blocks,
+    int_bit_extract_luts_buffer<Torus> *bit_extract) {

  copy_radix_ciphertext_slice_async<Torus>(
      streams.stream(0), streams.gpu_index(0), lwe_array_out, 0,
@@ -1684,17 +1742,19 @@ extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    }
  }
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, lwe_array_out, lwe_array_out, bsks, ksks, bit_extract->lut,
-      effective_num_radix_blocks);
+      streams, lwe_array_out, lwe_array_out, bsks, ksks, ms_noise_reduction_key,
+      bit_extract->lut, effective_num_radix_blocks);
 }

 template <typename Torus>
-__host__ void
-reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
-             CudaRadixCiphertextFFI *signs_array_in,
-             int_comparison_buffer<Torus> *mem_ptr,
-             std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
-             Torus *const *ksks, uint32_t num_sign_blocks) {
+__host__ void reduce_signs(
+    CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
+    CudaRadixCiphertextFFI *signs_array_in,
+    int_comparison_buffer<Torus> *mem_ptr,
+    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_sign_blocks) {

  if (signs_array_out->lwe_dimension != signs_array_in->lwe_dimension)
    PANIC("Cuda error: input lwe dimensions must be the same")
@@ -1740,7 +1800,8 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
      pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
                         signs_a, num_sign_blocks, message_modulus);
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, signs_a, signs_b, bsks, ksks, lut, num_sign_blocks / 2);
+          streams, signs_a, signs_b, bsks, ksks, ms_noise_reduction_key, lut,
+          num_sign_blocks / 2);

      if (num_sign_blocks % 2 == 1)
        copy_radix_ciphertext_slice_async<Torus>(
@@ -1770,7 +1831,8 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
    pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
                       signs_a, num_sign_blocks, message_modulus);
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, signs_array_out, signs_b, bsks, ksks, lut, 1);
+        streams, signs_array_out, signs_b, bsks, ksks, ms_noise_reduction_key,
+        lut, 1);

  } else {

@@ -1788,7 +1850,8 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
    lut->broadcast_lut(lut->active_streams);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
+        streams, signs_array_out, signs_a, bsks, ksks, ms_noise_reduction_key,
+        lut, 1);
  }
 }

@@ -1815,15 +1878,16 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
 }

 template <typename Torus>
-void host_apply_univariate_lut_kb(CudaStreams streams,
-                                  CudaRadixCiphertextFFI *radix_lwe_out,
-                                  CudaRadixCiphertextFFI const *radix_lwe_in,
-                                  int_radix_lut<Torus> *mem, Torus *const *ksks,
-                                  void *const *bsks) {
+void host_apply_univariate_lut_kb(
+    CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
+    CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks) {

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem,
-      radix_lwe_out->num_radix_blocks);
+      streams, radix_lwe_out, radix_lwe_in, bsks, ksks, ms_noise_reduction_key,
+      mem, radix_lwe_out->num_radix_blocks);
 }

 template <typename Torus>
@@ -1853,12 +1917,13 @@ template <typename Torus>
 void host_apply_many_univariate_lut_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
-    Torus *const *ksks, void *const *bsks, uint32_t num_many_lut,
-    uint32_t lut_stride) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {

  integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
-      streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem, num_many_lut,
-      lut_stride);
+      streams, radix_lwe_out, radix_lwe_in, bsks, ksks, ms_noise_reduction_key,
+      mem, num_many_lut, lut_stride);
 }

 template <typename Torus>
@@ -1884,17 +1949,17 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
 }

 template <typename Torus>
-void host_apply_bivariate_lut_kb(CudaStreams streams,
-                                 CudaRadixCiphertextFFI *radix_lwe_out,
-                                 CudaRadixCiphertextFFI const *radix_lwe_in_1,
-                                 CudaRadixCiphertextFFI const *radix_lwe_in_2,
-                                 int_radix_lut<Torus> *mem, Torus *const *ksks,
-                                 void *const *bsks, uint32_t num_radix_blocks,
-                                 uint32_t shift) {
+void host_apply_bivariate_lut_kb(
+    CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
+    CudaRadixCiphertextFFI const *radix_lwe_in_1,
+    CudaRadixCiphertextFFI const *radix_lwe_in_2, int_radix_lut<Torus> *mem,
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks, mem,
-      num_radix_blocks, shift);
+      streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks,
+      ms_noise_reduction_key, mem, num_radix_blocks, shift);
 }

 template <typename Torus>
@@ -1913,13 +1978,13 @@ uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
 // This function perform the three steps of Thomas' new carry propagation
 // includes the logic to extract overflow when requested
 template <typename Torus>
-void host_propagate_single_carry(CudaStreams streams,
-                                 CudaRadixCiphertextFFI *lwe_array,
-                                 CudaRadixCiphertextFFI *carry_out,
-                                 const CudaRadixCiphertextFFI *input_carries,
-                                 int_sc_prop_memory<Torus> *mem,
-                                 void *const *bsks, Torus *const *ksks,
-                                 uint32_t requested_flag, uint32_t uses_carry) {
+void host_propagate_single_carry(
+    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
+    CudaRadixCiphertextFFI *carry_out,
+    const CudaRadixCiphertextFFI *input_carries, int_sc_prop_memory<Torus> *mem,
+    void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t requested_flag, uint32_t uses_carry) {
  PUSH_RANGE("propagate sc")
  auto num_radix_blocks = lwe_array->num_radix_blocks;
  auto params = mem->params;
@@ -1942,8 +2007,8 @@ void host_propagate_single_carry(CudaStreams streams,

  // Step 1
  host_compute_shifted_blocks_and_states<Torus>(
-      streams, lwe_array, mem->shifted_blocks_state_mem, bsks, ksks, lut_stride,
-      num_many_lut);
+      streams, lwe_array, mem->shifted_blocks_state_mem, bsks, ksks,
+      ms_noise_reduction_key, lut_stride, num_many_lut);
  auto block_states = mem->shifted_blocks_state_mem->block_states;

  if (requested_flag == outputFlag::FLAG_CARRY) {
@@ -1954,7 +2019,7 @@ void host_propagate_single_carry(CudaStreams streams,
  // Step 2
  host_compute_propagation_simulators_and_group_carries<Torus>(
      streams, block_states, params, mem->prop_simu_group_carries_mem, bsks,
-      ksks, num_radix_blocks, mem->num_groups);
+      ksks, ms_noise_reduction_key, num_radix_blocks, mem->num_groups);

  auto group_size = mem->prop_simu_group_carries_mem->group_size;

@@ -1995,7 +2060,7 @@ void host_propagate_single_carry(CudaStreams streams,
        num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, mem->output_flag, prepared_blocks, bsks, ksks,
-        mem->lut_message_extract, num_radix_blocks + 1);
+        ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks + 1);

    copy_radix_ciphertext_slice_async<Torus>(
        streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_radix_blocks,
@@ -2006,8 +2071,8 @@ void host_propagate_single_carry(CudaStreams streams,
  } else {
    auto message_extract = mem->lut_message_extract;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, lwe_array, prepared_blocks, bsks, ksks, message_extract,
-        num_radix_blocks);
+        streams, lwe_array, prepared_blocks, bsks, ksks, ms_noise_reduction_key,
+        message_extract, num_radix_blocks);
  }
  POP_RANGE()
 }
@@ -2019,8 +2084,9 @@ void host_add_and_propagate_single_carry(
    CudaStreams streams, CudaRadixCiphertextFFI *lhs_array,
    const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *input_carries, int_sc_prop_memory<Torus> *mem,
-    void *const *bsks, Torus *const *ksks, uint32_t requested_flag,
-    uint32_t uses_carry) {
+    void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t requested_flag, uint32_t uses_carry) {
  PUSH_RANGE("add & propagate sc")
  if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -2073,14 +2139,15 @@ void host_add_and_propagate_single_carry(
  }
  // Step 1
  host_compute_shifted_blocks_and_states<Torus>(
-      streams, lhs_array, mem->shifted_blocks_state_mem, bsks, ksks, lut_stride,
-      num_many_lut);
+      streams, lhs_array, mem->shifted_blocks_state_mem, bsks, ksks,
+      ms_noise_reduction_key, lut_stride, num_many_lut);
  auto block_states = mem->shifted_blocks_state_mem->block_states;
  if (requested_flag == outputFlag::FLAG_OVERFLOW) {
    auto lut_overflow_prep = mem->lut_overflow_flag_prep;
    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, &output_flag, mem->last_lhs, mem->last_rhs, bsks, ksks,
-        lut_overflow_prep, 1, lut_overflow_prep->params.message_modulus);
+        ms_noise_reduction_key, lut_overflow_prep, 1,
+        lut_overflow_prep->params.message_modulus);
  } else if (requested_flag == outputFlag::FLAG_CARRY) {
    copy_radix_ciphertext_slice_async<Torus>(
        streams.stream(0), streams.gpu_index(0), &output_flag, 0, 1,
@@ -2090,7 +2157,7 @@ void host_add_and_propagate_single_carry(
  // Step 2
  host_compute_propagation_simulators_and_group_carries<Torus>(
      streams, block_states, params, mem->prop_simu_group_carries_mem, bsks,
-      ksks, num_radix_blocks, mem->num_groups);
+      ksks, ms_noise_reduction_key, num_radix_blocks, mem->num_groups);

  auto group_size = mem->prop_simu_group_carries_mem->group_size;

@@ -2143,7 +2210,7 @@ void host_add_and_propagate_single_carry(
        num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, mem->output_flag, prepared_blocks, bsks, ksks,
-        mem->lut_message_extract, num_radix_blocks + 1);
+        ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks + 1);

    copy_radix_ciphertext_slice_async<Torus>(
        streams.stream(0), streams.gpu_index(0), lhs_array, 0, num_radix_blocks,
@@ -2154,7 +2221,7 @@ void host_add_and_propagate_single_carry(
        mem->output_flag, num_radix_blocks, num_radix_blocks + 1);
  } else {
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, lhs_array, prepared_blocks, bsks, ksks,
+        streams, lhs_array, prepared_blocks, bsks, ksks, ms_noise_reduction_key,
        mem->lut_message_extract, num_radix_blocks);
  }
  POP_RANGE()
@@ -2177,15 +2244,14 @@ uint64_t scratch_cuda_integer_overflowing_sub(
 // This function perform the three steps of Thomas' new borrow propagation
 // includes the logic to extract overflow when requested
 template <typename Torus>
-void host_single_borrow_propagate(CudaStreams streams,
-                                  CudaRadixCiphertextFFI *lwe_array,
-                                  CudaRadixCiphertextFFI *overflow_block,
-                                  const CudaRadixCiphertextFFI *input_borrow,
-                                  int_borrow_prop_memory<Torus> *mem,
-                                  void *const *bsks, Torus *const *ksks,
-                                  uint32_t num_groups,
-                                  uint32_t compute_overflow,
-                                  uint32_t uses_input_borrow) {
+void host_single_borrow_propagate(
+    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
+    CudaRadixCiphertextFFI *overflow_block,
+    const CudaRadixCiphertextFFI *input_borrow,
+    int_borrow_prop_memory<Torus> *mem, void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_groups, uint32_t compute_overflow,
+    uint32_t uses_input_borrow) {
  auto num_radix_blocks = lwe_array->num_radix_blocks;
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
@@ -2207,7 +2273,7 @@ void host_single_borrow_propagate(CudaStreams streams,
  // Step 1
  host_compute_shifted_blocks_and_borrow_states<Torus>(
      streams, lwe_array, mem->shifted_blocks_borrow_state_mem, bsks, ksks,
-      lut_stride, num_many_lut);
+      ms_noise_reduction_key, lut_stride, num_many_lut);

  auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
  copy_radix_ciphertext_slice_async<Torus>(
@@ -2217,7 +2283,7 @@ void host_single_borrow_propagate(CudaStreams streams,
  // Step 2
  host_compute_propagation_simulators_and_group_carries<Torus>(
      streams, borrow_states, params, mem->prop_simu_group_carries_mem, bsks,
-      ksks, num_radix_blocks, num_groups);
+      ksks, ms_noise_reduction_key, num_radix_blocks, num_groups);

  auto shifted_blocks =
      (Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
@@ -2256,53 +2322,27 @@ void host_single_borrow_propagate(CudaStreams streams,
                         params.carry_modulus);
  }

-  cuda_event_record(mem->incoming_events[0], streams.stream(0),
-                    streams.gpu_index(0));
-  for (int j = 0; j < mem->active_streams.count(); j++) {
-    cuda_stream_wait_event(mem->sub_streams_1.stream(j),
-                           mem->incoming_events[0],
-                           mem->sub_streams_1.gpu_index(j));
-    cuda_stream_wait_event(mem->sub_streams_2.stream(j),
-                           mem->incoming_events[0],
-                           mem->sub_streams_1.gpu_index(j));
-  }
-
  if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
    auto borrow_flag = mem->lut_borrow_flag;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        mem->sub_streams_1, overflow_block, mem->overflow_block, bsks, ksks,
-        borrow_flag, 1);
-  }
-  for (int j = 0; j < mem->active_streams.count(); j++) {
-    cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1.stream(j),
-                      mem->sub_streams_1.gpu_index(j));
+        streams, overflow_block, mem->overflow_block, bsks, ksks,
+        ms_noise_reduction_key, borrow_flag, 1);
  }

  // subtract borrow and cleanup prepared blocks
  auto resolved_carries = mem->prop_simu_group_carries_mem->resolved_carries;
  host_negation<Torus>(
-      mem->sub_streams_2.stream(0), mem->sub_streams_2.gpu_index(0),
-      (Torus *)resolved_carries->ptr, (Torus *)resolved_carries->ptr,
-      big_lwe_dimension, num_groups);
+      streams.stream(0), streams.gpu_index(0), (Torus *)resolved_carries->ptr,
+      (Torus *)resolved_carries->ptr, big_lwe_dimension, num_groups);

  host_radix_sum_in_groups<Torus>(
-      mem->sub_streams_2.stream(0), mem->sub_streams_2.gpu_index(0),
-      prepared_blocks, prepared_blocks, resolved_carries, num_radix_blocks,
-      mem->group_size);
+      streams.stream(0), streams.gpu_index(0), prepared_blocks, prepared_blocks,
+      resolved_carries, num_radix_blocks, mem->group_size);

  auto message_extract = mem->lut_message_extract;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      mem->sub_streams_2, lwe_array, prepared_blocks, bsks, ksks,
+      streams, lwe_array, prepared_blocks, bsks, ksks, ms_noise_reduction_key,
      message_extract, num_radix_blocks);
-
-  for (int j = 0; j < mem->active_streams.count(); j++) {
-    cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2.stream(j),
-                      mem->sub_streams_2.gpu_index(j));
-    cuda_stream_wait_event(streams.stream(0), mem->outgoing_events1[j],
-                           streams.gpu_index(0));
-    cuda_stream_wait_event(streams.stream(0), mem->outgoing_events2[j],
-                           streams.gpu_index(0));
-  }
 }

 /// num_radix_blocks corresponds to the number of blocks on which to apply the
@@ -2313,7 +2353,8 @@ __host__ void integer_radix_apply_noise_squashing_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,
-    InputTorus *const *ksks) {
+    InputTorus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  PUSH_RANGE("apply noise squashing")
  auto params = lut->params;
@@ -2334,7 +2375,7 @@ __host__ void integer_radix_apply_noise_squashing_kb(

  /// For multi GPU execution we create vectors of pointers for inputs and
  /// outputs
-  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks.get();
+  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
  std::vector<InputTorus *> lwe_array_in_vec = lut->lwe_array_in_vec;
  std::vector<InputTorus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
  std::vector<__uint128_t *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
@@ -2352,10 +2393,11 @@ __host__ void integer_radix_apply_noise_squashing_kb(
      streams.active_gpu_subset(lwe_array_out->num_radix_blocks);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<InputTorus>(
-        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
-        (InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in.get(), ksks,
-        lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log,
-        ks_level, lwe_array_out->num_radix_blocks);
+        streams.subset_first_gpu(), lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], (InputTorus *)lwe_array_pbs_in->ptr,
+        lut->lwe_indexes_in, ksks, lut->input_big_lwe_dimension,
+        small_lwe_dimension, ks_base_log, ks_level,
+        lwe_array_out->num_radix_blocks);

    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
    /// dimension to a big LWE dimension
@@ -2363,12 +2405,13 @@ __host__ void integer_radix_apply_noise_squashing_kb(
    /// int_noise_squashing_lut doesn't support a different output or lut
    /// indexing than the trivial
    execute_pbs_async<uint64_t, __uint128_t>(
-        streams.get_ith(0), (__uint128_t *)lwe_array_out->ptr,
+        streams.subset_first_gpu(), (__uint128_t *)lwe_array_out->ptr,
        lwe_trivial_indexes_vec[0], lut->lut_vec, lwe_trivial_indexes_vec,
-        lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer,
-        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, lwe_array_out->num_radix_blocks,
-        params.pbs_type, 0, 0);
+        lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
+        ms_noise_reduction_key, lut->pbs_buffer, glwe_dimension,
+        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
+        grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0,
+        0);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
@@ -2377,7 +2420,7 @@ __host__ void integer_radix_apply_noise_squashing_kb(
    /// gather data to GPU 0 we can copy back to the original indexing
    multi_gpu_scatter_lwe_async<InputTorus>(
        active_streams, lwe_array_in_vec, (InputTorus *)lwe_array_pbs_in->ptr,
-        lut->lwe_indexes_in.get(), lut->using_trivial_lwe_indexes,
+        lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
        lut->lwe_aligned_scatter_vec, lut->active_streams.count(),
        lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);

@@ -2392,10 +2435,10 @@ __host__ void integer_radix_apply_noise_squashing_kb(
    execute_pbs_async<uint64_t, __uint128_t>(
        active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
        lut->lut_vec, lwe_trivial_indexes_vec, lwe_after_ks_vec,
-        lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
-        small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0,
-        0);
+        lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, lwe_array_out->num_radix_blocks,
+        params.pbs_type, 0, 0);

    /// Copy data back to GPU 0 and release vecs
    /// In apply noise squashing we always use trivial indexes
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
@@ -128,51 +128,59 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
    CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
-    void *const *bsks, void *const *ksks, int8_t *mem_ptr,
-    uint32_t polynomial_size, uint32_t num_blocks) {
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
  PUSH_RANGE("mul")
  switch (polynomial_size) {
  case 256:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
+        num_blocks);
    break;
  case 512:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
+        num_blocks);
    break;
  case 1024:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
+        num_blocks);
    break;
  case 2048:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
+        num_blocks);
    break;
  case 4096:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
+        num_blocks);
    break;
  case 8192:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
+        num_blocks);
    break;
  case 16384:
    host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
        CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
        radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
-        (int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
+        ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
+        num_blocks);
    break;
  default:
    PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -217,7 +225,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
  if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
@@ -225,7 +234,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
          "output's number of radix blocks")
  host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
      CudaStreams(streams), radix_lwe_out, radix_lwe_vec, bsks,
-      (uint64_t **)(ksks), mem, radix_lwe_out->num_radix_blocks,
+      (uint64_t **)(ksks), ms_noise_reduction_key, mem,
+      radix_lwe_out->num_radix_blocks,
      radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -291,6 +291,7 @@ template <typename Torus>
 __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
    uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
  auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
@@ -375,7 +376,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

  while (needs_processing) {
    auto luts_message_carry = mem_ptr->luts_message_carry;
-    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in.get();
+    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
    auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
    calculate_chunks<Torus>
        <<<number_of_blocks_2d, number_of_threads, 0, streams.stream(0)>>>(
@@ -397,15 +398,17 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

    if (active_streams.count() == 1) {
      execute_keyswitch_async<Torus>(
-          streams.get_ith(0), (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in,
-          (Torus *)current_blocks->ptr, d_pbs_indexes_in, ksks,
-          big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
-          mem_ptr->params.ks_level, total_messages);
+          streams.subset_first_gpu(), (Torus *)small_lwe_vector->ptr,
+          d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
+          ksks, big_lwe_dimension, small_lwe_dimension,
+          mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
+          total_messages);

      execute_pbs_async<Torus, Torus>(
-          streams.get_ith(0), (Torus *)current_blocks->ptr, d_pbs_indexes_out,
-          luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
-          (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in, bsks,
+          streams.subset_first_gpu(), (Torus *)current_blocks->ptr,
+          d_pbs_indexes_out, luts_message_carry->lut_vec,
+          luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
+          d_pbs_indexes_in, bsks, ms_noise_reduction_key,
          luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
          polynomial_size, mem_ptr->params.pbs_base_log,
          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
@@ -419,7 +422,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, current_blocks, current_blocks, bsks, ksks,
-          luts_message_carry, total_ciphertexts);
+          ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
    }
    cuda_set_device(streams.gpu_index(0));
    std::swap(d_columns, d_new_columns);
@@ -433,7 +436,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

  if (mem_ptr->reduce_degrees_for_single_carry_propagation) {
    auto luts_message_carry = mem_ptr->luts_message_carry;
-    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in.get();
+    auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
    auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
    prepare_final_pbs_indexes<Torus>
        <<<1, 2 * num_radix_blocks, 0, streams.stream(0)>>>(
@@ -448,15 +451,16 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

    if (active_streams.count() == 1) {
      execute_keyswitch_async<Torus>(
-          streams.get_ith(0), (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in,
-          (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
+          streams.subset_first_gpu(), (Torus *)small_lwe_vector->ptr,
+          d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
          big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
          mem_ptr->params.ks_level, num_radix_blocks);

      execute_pbs_async<Torus, Torus>(
-          streams.get_ith(0), (Torus *)current_blocks->ptr, d_pbs_indexes_out,
-          luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
-          (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in, bsks,
+          streams.subset_first_gpu(), (Torus *)current_blocks->ptr,
+          d_pbs_indexes_out, luts_message_carry->lut_vec,
+          luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
+          d_pbs_indexes_in, bsks, ms_noise_reduction_key,
          luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
          polynomial_size, mem_ptr->params.pbs_base_log,
          mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
@@ -470,7 +474,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          active_streams, current_blocks, radix_lwe_out, bsks, ksks,
-          luts_message_carry, num_blocks_in_apply_lut);
+          ms_noise_reduction_key, luts_message_carry, num_blocks_in_apply_lut);
    }
    calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
                            num_radix_blocks, num_radix_in_vec, chunk_size,
@@ -492,8 +496,9 @@ __host__ void host_integer_mult_radix_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
    CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
-    void *const *bsks, uint64_t *const *ksks, int_mul_memory<Torus> *mem_ptr,
-    uint32_t num_blocks) {
+    void *const *bsks, uint64_t *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {

  if (radix_lwe_out->lwe_dimension != radix_lwe_left->lwe_dimension ||
      radix_lwe_right->lwe_dimension != radix_lwe_left->lwe_dimension)
@@ -511,14 +516,14 @@ __host__ void host_integer_mult_radix_kb(
  if (is_bool_right) {
    zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_left, radix_lwe_right,
                       mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
-                       bsks, ksks, num_blocks);
+                       bsks, ksks, ms_noise_reduction_key, num_blocks);
    return;
  }

  if (is_bool_left) {
    zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_right, radix_lwe_left,
                       mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
-                       bsks, ksks, num_blocks);
+                       bsks, ksks, ms_noise_reduction_key, num_blocks);
    return;
  }

@@ -587,7 +592,8 @@ __host__ void host_integer_mult_radix_kb(

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, block_mul_res, block_mul_res, vector_result_sb, bsks, ksks,
-      luts_array, total_block_count, luts_array->params.message_modulus);
+      ms_noise_reduction_key, luts_array, total_block_count,
+      luts_array->params.message_modulus);

  vector_result_lsb = block_mul_res;
  as_radix_ciphertext_slice<Torus>(&vector_result_msb, block_mul_res,
@@ -615,14 +621,15 @@ __host__ void host_integer_mult_radix_kb(
  }
  host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
      streams, radix_lwe_out, vector_result_sb, bsks, ksks,
-      mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
+      ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
+      2 * num_blocks);

  auto scp_mem_ptr = mem_ptr->sc_prop_mem;
  uint32_t requested_flag = outputFlag::FLAG_NONE;
  uint32_t uses_carry = 0;
-  host_propagate_single_carry<Torus>(streams, radix_lwe_out, nullptr, nullptr,
-                                     scp_mem_ptr, bsks, ksks, requested_flag,
-                                     uses_carry);
+  host_propagate_single_carry<Torus>(
+      streams, radix_lwe_out, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
+      ms_noise_reduction_key, requested_flag, uses_carry);
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -134,7 +134,9 @@ __host__ void host_integer_overflowing_sub(
    CudaRadixCiphertextFFI *overflow_block,
    const CudaRadixCiphertextFFI *input_borrow,
    int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t compute_overflow, uint32_t uses_input_borrow) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t compute_overflow, uint32_t uses_input_borrow) {
  PUSH_RANGE("overflowing sub")
  if (output->num_radix_blocks != input_left->num_radix_blocks ||
      output->num_radix_blocks != input_right->num_radix_blocks)
@@ -164,7 +166,7 @@ __host__ void host_integer_overflowing_sub(
  host_single_borrow_propagate<Torus>(
      streams, output, overflow_block, input_borrow,
      (int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
-      num_groups, compute_overflow, uses_input_borrow);
+      ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
  POP_RANGE()
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cu
@@ -5,9 +5,10 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks_to_process,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, uint32_t message_bits_per_block,
-    uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type) {
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    uint32_t message_bits_per_block, uint32_t total_random_bits,
+    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -17,19 +18,20 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(

  return scratch_cuda_integer_grouped_oprf<uint64_t>(
      CudaStreams(streams), (int_grouped_oprf_memory<uint64_t> **)mem_ptr,
-      params, num_blocks_to_process, message_bits_per_block, total_random_bits,
-      allocate_gpu_memory);
+      params, num_blocks_to_process, num_blocks, message_bits_per_block,
+      total_random_bits, allocate_gpu_memory);
 }

-void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
-                                        CudaRadixCiphertextFFI *radix_lwe_out,
-                                        const void *seeded_lwe_input,
-                                        uint32_t num_blocks_to_process,
-                                        int8_t *mem, void *const *bsks) {
+void cuda_integer_grouped_oprf_async_64(
+    CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
+    const void *seeded_lwe_input, uint32_t num_blocks_to_process, int8_t *mem,
+    void *const *bsks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  host_integer_grouped_oprf<uint64_t>(
      CudaStreams(streams), radix_lwe_out, (const uint64_t *)seeded_lwe_input,
-      num_blocks_to_process, (int_grouped_oprf_memory<uint64_t> *)mem, bsks);
+      num_blocks_to_process, (int_grouped_oprf_memory<uint64_t> *)mem, bsks,
+      ms_noise_reduction_key);
 }

 void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/oprf.cuh
@@ -8,34 +8,34 @@ template <typename Torus>
 uint64_t scratch_cuda_integer_grouped_oprf(
    CudaStreams streams, int_grouped_oprf_memory<Torus> **mem_ptr,
    int_radix_params params, uint32_t num_blocks_to_process,
-    uint32_t message_bits_per_block, uint64_t total_random_bits,
-    bool allocate_gpu_memory) {
+    uint32_t num_blocks, uint32_t message_bits_per_block,
+    uint64_t total_random_bits, bool allocate_gpu_memory) {
  uint64_t size_tracker = 0;

  *mem_ptr = new int_grouped_oprf_memory<Torus>(
-      streams, params, num_blocks_to_process, message_bits_per_block,
-      total_random_bits, allocate_gpu_memory, size_tracker);
+      streams, params, num_blocks_to_process, num_blocks,
+      message_bits_per_block, total_random_bits, allocate_gpu_memory,
+      size_tracker);

  return size_tracker;
 }

 template <typename Torus>
-void host_integer_grouped_oprf(CudaStreams streams,
-                               CudaRadixCiphertextFFI *radix_lwe_out,
-                               const Torus *seeded_lwe_input,
-                               uint32_t num_blocks_to_process,
-                               int_grouped_oprf_memory<Torus> *mem_ptr,
-                               void *const *bsks) {
+void host_integer_grouped_oprf(
+    CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
+    const Torus *seeded_lwe_input, uint32_t num_blocks_to_process,
+    int_grouped_oprf_memory<Torus> *mem_ptr, void *const *bsks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
  auto lut = mem_ptr->luts;

  if (active_streams.count() == 1) {
    execute_pbs_async<Torus, Torus>(
-        streams.get_ith(0), (Torus *)(radix_lwe_out->ptr), lut->lwe_indexes_out,
+        streams, (Torus *)(radix_lwe_out->ptr), lut->lwe_indexes_out,
        lut->lut_vec, lut->lut_indexes_vec,
-        const_cast<Torus *>(seeded_lwe_input), lut->lwe_indexes_in.get(), bsks,
-        lut->buffer, mem_ptr->params.glwe_dimension,
+        const_cast<Torus *>(seeded_lwe_input), lut->lwe_indexes_in, bsks,
+        ms_noise_reduction_key, lut->buffer, mem_ptr->params.glwe_dimension,
        mem_ptr->params.small_lwe_dimension, mem_ptr->params.polynomial_size,
        mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
        mem_ptr->params.grouping_factor, num_blocks_to_process,
@@ -45,35 +45,48 @@ void host_integer_grouped_oprf(CudaStreams streams,
    std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
    std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

-    lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
-        active_streams);
+    cuda_event_record(lut->event_scatter_in, streams.stream(0),
+                      streams.gpu_index(0));
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
+                             streams.gpu_index(j));
+    }
+
+    if (!lut->using_trivial_lwe_indexes) {
+      PANIC("lut->using_trivial_lwe_indexes should be true");
+    }

-    PUSH_RANGE("scatter")
    multi_gpu_scatter_lwe_async<Torus>(
-        active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in.get(),
+        active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in,
        lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
        active_streams.count(), num_blocks_to_process,
        mem_ptr->params.small_lwe_dimension + 1);
-    POP_RANGE()

    execute_pbs_async<Torus, Torus>(
        active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
        lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
-        lwe_trivial_indexes_vec, bsks, lut->buffer,
+        lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
        mem_ptr->params.glwe_dimension, mem_ptr->params.small_lwe_dimension,
        mem_ptr->params.polynomial_size, mem_ptr->params.pbs_base_log,
        mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
        num_blocks_to_process, mem_ptr->params.pbs_type, 1, 0);

-    PUSH_RANGE("gather")
    multi_gpu_gather_lwe_async<Torus>(
        active_streams, (Torus *)radix_lwe_out->ptr, lwe_after_pbs_vec,
        lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
        lut->lwe_aligned_vec, num_blocks_to_process,
        mem_ptr->params.big_lwe_dimension + 1);
-    POP_RANGE()
-    lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
-        active_streams);
+
+    // other gpus record their events
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
+                        streams.gpu_index(j));
+    }
+    // GPU 0 waits for all
+    for (int j = 1; j < active_streams.count(); j++) {
+      cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
+                             streams.gpu_index(0));
+    }
  }

  for (uint32_t i = 0; i < num_blocks_to_process; i++) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -7,13 +7,6 @@
 #include "utils/helper_profile.cuh"
 #include "utils/kernel_dimensions.cuh"

-inline CudaLweCiphertextListFFI
-to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
-  return {.ptr = radix->ptr,
-          .num_radix_blocks = radix->num_radix_blocks,
-          .lwe_dimension = radix->lwe_dimension};
-}
-
 template <typename Torus>
 void create_zero_radix_ciphertext_async(cudaStream_t const stream,
                                        uint32_t const gpu_index,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -4,13 +4,15 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
    void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  host_integer_radix_scalar_bitop_kb<uint64_t>(
      CudaStreams(streams), lwe_array_out, lwe_array_input,
      static_cast<const uint64_t *>(clear_blocks),
      static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
-      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      ms_noise_reduction_key);
 }

 void update_degrees_after_scalar_bitand(uint64_t *output_degrees,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -9,7 +9,8 @@ __host__ void host_integer_radix_scalar_bitop_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
    Torus const *h_clear_blocks, uint32_t num_clear_blocks,
-    int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
+    int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  if (output->num_radix_blocks != input->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be equal")
@@ -49,7 +50,8 @@ __host__ void host_integer_radix_scalar_bitop_kb(
    lut->broadcast_lut(active_streams, false);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, output, input, bsks, ksks, lut, num_clear_blocks);
+        streams, output, input, bsks, ksks, ms_noise_reduction_key, lut,
+        num_clear_blocks);
    memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));

    if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
@@ -35,7 +35,9 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
    void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_scalar_blocks) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_scalar_blocks) {

  // The output ciphertext might be a boolean block or a radix ciphertext
  // depending on the case (eq/gt vs max/min) so the amount of blocks to
@@ -49,7 +51,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
    host_integer_radix_scalar_equality_check_kb<uint64_t>(
        CudaStreams(streams), lwe_array_out, lwe_array_in,
        static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
-        (uint64_t **)(ksks), num_radix_blocks, num_scalar_blocks);
+        (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
+        num_scalar_blocks);
    break;
  case GT:
  case GE:
@@ -63,7 +66,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
        static_cast<const uint64_t *>(scalar_blocks),
        static_cast<const uint64_t *>(h_scalar_blocks), buffer,
        buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
-        num_radix_blocks, num_scalar_blocks);
+        ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
    break;
  case MAX:
  case MIN:
@@ -74,7 +77,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
        CudaStreams(streams), lwe_array_out, lwe_array_in,
        static_cast<const uint64_t *>(scalar_blocks),
        static_cast<const uint64_t *>(h_scalar_blocks), buffer, bsks,
-        (uint64_t **)(ksks), num_radix_blocks, num_scalar_blocks);
+        (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
+        num_scalar_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -29,7 +29,9 @@ __host__ void scalar_compare_radix_blocks_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks) {

  if (num_radix_blocks == 0)
    return;
@@ -69,8 +71,8 @@ __host__ void scalar_compare_radix_blocks_kb(
  // Apply LUT to compare to 0
  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, lwe_array_out, subtracted_blocks, bsks, ksks, sign_lut,
-      num_radix_blocks);
+      streams, lwe_array_out, subtracted_blocks, bsks, ksks,
+      ms_noise_reduction_key, sign_lut, num_radix_blocks);

  // FIXME: without this sync signed scalar eq tests fail, I don't understand
  // the reason
@@ -88,7 +90,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
    Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
    PANIC("Cuda error: input lwe dimensions must be the same")
  if (lwe_array_in->num_radix_blocks < num_radix_blocks)
@@ -128,10 +132,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    // means scalar is zero
    host_compare_blocks_with_zero<Torus>(
        streams, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
-        num_radix_blocks, mem_ptr->is_zero_lut);
+        ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
    are_all_comparisons_block_true<Torus>(
        streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_lwe_array_out,
-        mem_ptr, bsks, ksks, mem_ptr->tmp_lwe_array_out->num_radix_blocks);
+        mem_ptr, bsks, ksks, ms_noise_reduction_key,
+        mem_ptr->tmp_lwe_array_out->num_radix_blocks);

    auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
      x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -149,7 +154,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    lut->broadcast_lut(active_streams);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks, lut, 1);
+        streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks,
+        ms_noise_reduction_key, lut, 1);

  } else if (num_scalar_blocks < num_radix_blocks) {
    // We have to handle both part of the work described above
@@ -201,7 +207,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    auto comparisons = mem_ptr->tmp_block_comparisons;
    scalar_compare_radix_blocks_kb<Torus>(
        lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
-        mem_ptr, bsks, ksks, num_lsb_radix_blocks);
+        mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
@@ -209,15 +215,15 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
                               mem_ptr->diff_buffer->tree_buffer,
                               mem_ptr->identity_lut_f, bsks, ksks,
-                               num_lsb_radix_blocks);
+                               ms_noise_reduction_key, num_lsb_radix_blocks);
    //////////////
    // msb
    host_compare_blocks_with_zero<Torus>(
        msb_streams, &lwe_array_msb_out, &msb, mem_ptr, bsks, ksks,
-        num_msb_radix_blocks, mem_ptr->is_zero_lut);
+        ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
    are_all_comparisons_block_true<Torus>(
        msb_streams, &lwe_array_msb_out, &lwe_array_msb_out, mem_ptr, bsks,
-        ksks, lwe_array_msb_out.num_radix_blocks);
+        ksks, ms_noise_reduction_key, lwe_array_msb_out.num_radix_blocks);
    lsb_streams.synchronize();
    msb_streams.synchronize();

@@ -244,7 +250,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, lwe_array_out, lwe_array_lsb_out, &lwe_array_msb_out, bsks,
-        ksks, lut, 1, lut->params.message_modulus);
+        ksks, ms_noise_reduction_key, lut, 1, lut->params.message_modulus);

  } else {
    if (num_radix_blocks == 1) {
@@ -277,7 +283,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
      one_block_lut->broadcast_lut(active_streams);

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
+          streams, lwe_array_out, lwe_array_in, bsks, ksks,
+          ms_noise_reduction_key, one_block_lut, 1);
      one_block_lut->release(streams);
      delete one_block_lut;
    } else {
@@ -307,7 +314,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
      auto comparisons = mem_ptr->tmp_lwe_array_out;
      scalar_compare_radix_blocks_kb<Torus>(
          streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
-          mem_ptr, bsks, ksks, num_lsb_radix_blocks);
+          mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);

      // Reduces a vec containing radix blocks that encrypts a sign
      // (inferior, equal, superior) to one single radix block containing the
@@ -315,7 +322,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
      tree_sign_reduction<Torus>(streams, lwe_array_out, comparisons,
                                 mem_ptr->diff_buffer->tree_buffer,
                                 sign_handler_f, bsks, ksks,
-                                 num_lsb_radix_blocks);
+                                 ms_noise_reduction_key, num_lsb_radix_blocks);
    }
  }
 }
@@ -326,7 +333,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
    Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
    PANIC("Cuda error: input lwe dimensions must be the same")
@@ -367,10 +376,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    auto are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
    host_compare_blocks_with_zero<Torus>(
        streams, are_all_msb_zeros, lwe_array_in, mem_ptr, bsks, ksks,
-        num_radix_blocks, mem_ptr->is_zero_lut);
+        ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
    are_all_comparisons_block_true<Torus>(
        streams, are_all_msb_zeros, are_all_msb_zeros, mem_ptr, bsks, ksks,
-        are_all_msb_zeros->num_radix_blocks);
+        ms_noise_reduction_key, are_all_msb_zeros->num_radix_blocks);
    CudaRadixCiphertextFFI sign_block;
    as_radix_ciphertext_slice<Torus>(&sign_block, lwe_array_in,
                                     num_radix_blocks - 1, num_radix_blocks);
@@ -421,8 +430,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    lut->broadcast_lut(active_streams);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks, lut,
-        1, lut->params.message_modulus);
+        streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks,
+        ms_noise_reduction_key, lut, 1, lut->params.message_modulus);

  } else if (num_scalar_blocks < num_radix_blocks) {
    // We have to handle both part of the work described above
@@ -468,7 +477,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    auto comparisons = mem_ptr->tmp_block_comparisons;
    scalar_compare_radix_blocks_kb<Torus>(
        lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
-        mem_ptr, bsks, ksks, num_lsb_radix_blocks);
+        mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);

    // Reduces a vec containing radix blocks that encrypts a sign
    // (inferior, equal, superior) to one single radix block containing the
@@ -476,17 +485,17 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
                               mem_ptr->diff_buffer->tree_buffer,
                               mem_ptr->identity_lut_f, bsks, ksks,
-                               num_lsb_radix_blocks);
+                               ms_noise_reduction_key, num_lsb_radix_blocks);
    //////////////
    // msb
    // We remove the last block (which is the sign)
    auto are_all_msb_zeros = lwe_array_msb_out;
    host_compare_blocks_with_zero<Torus>(
        msb_streams, &are_all_msb_zeros, &msb, mem_ptr, bsks, ksks,
-        num_msb_radix_blocks, mem_ptr->is_zero_lut);
+        ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
    are_all_comparisons_block_true<Torus>(
        msb_streams, &are_all_msb_zeros, &are_all_msb_zeros, mem_ptr, bsks,
-        ksks, are_all_msb_zeros.num_radix_blocks);
+        ksks, ms_noise_reduction_key, are_all_msb_zeros.num_radix_blocks);

    auto sign_bit_pos = (int)log2(message_modulus) - 1;

@@ -527,14 +536,15 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        &sign_block, &msb, num_msb_radix_blocks - 1, num_msb_radix_blocks);
    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, &lwe_array_msb_out, &sign_block, &are_all_msb_zeros, bsks,
-        ksks, signed_msb_lut, 1, signed_msb_lut->params.message_modulus);
+        ksks, ms_noise_reduction_key, signed_msb_lut, 1,
+        signed_msb_lut->params.message_modulus);
    lsb_streams.synchronize();
    msb_streams.synchronize();

    //////////////
    // Reduce the two blocks into one final
    reduce_signs<Torus>(streams, lwe_array_out, lwe_array_lsb_out, mem_ptr,
-                        sign_handler_f, bsks, ksks, 2);
+                        sign_handler_f, bsks, ksks, ms_noise_reduction_key, 2);

  } else {
    if (num_radix_blocks == 1) {
@@ -569,7 +579,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
      one_block_lut->broadcast_lut(active_streams);

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
+          streams, lwe_array_out, lwe_array_in, bsks, ksks,
+          ms_noise_reduction_key, one_block_lut, 1);
      one_block_lut->release(streams);
      delete one_block_lut;
    } else {
@@ -608,7 +619,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
      // - 2 if lhs > rhs
      scalar_compare_radix_blocks_kb<Torus>(
          lsb_streams, lwe_array_ct_out, diff_buffer->tmp_packed,
-          (Torus *)rhs.ptr, mem_ptr, bsks, ksks, num_lsb_radix_blocks);
+          (Torus *)rhs.ptr, mem_ptr, bsks, ksks, ms_noise_reduction_key,
+          num_lsb_radix_blocks);
      CudaRadixCiphertextFFI encrypted_sign_block;
      as_radix_ciphertext_slice<Torus>(&encrypted_sign_block, lwe_array_in,
                                       num_radix_blocks - 1, num_radix_blocks);
@@ -624,8 +636,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          msb_streams, &lwe_array_sign_out, &encrypted_sign_block,
-          trivial_sign_block, bsks, ksks, mem_ptr->signed_lut, 1,
-          mem_ptr->signed_lut->params.message_modulus);
+          trivial_sign_block, bsks, ksks, ms_noise_reduction_key,
+          mem_ptr->signed_lut, 1, mem_ptr->signed_lut->params.message_modulus);
      lsb_streams.synchronize();
      msb_streams.synchronize();

@@ -633,7 +645,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
      // (inferior, equal, superior) to one single radix block containing the
      // final sign
      reduce_signs<Torus>(streams, lwe_array_out, lwe_array_ct_out, mem_ptr,
-                          sign_handler_f, bsks, ksks, num_lsb_radix_blocks + 1);
+                          sign_handler_f, bsks, ksks, ms_noise_reduction_key,
+                          num_lsb_radix_blocks + 1);
    }
  }
 }
@@ -644,7 +657,9 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
    CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
    Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
    std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
    PANIC("Cuda error: input lwe dimensions must be the same")
@@ -656,13 +671,13 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
    // is signed and scalar is positive
    integer_radix_signed_scalar_difference_check_kb<Torus>(
        streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
-        mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
-        num_scalar_blocks);
+        mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
+        num_radix_blocks, num_scalar_blocks);
  } else {
    integer_radix_unsigned_scalar_difference_check_kb<Torus>(
        streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
-        mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
-        num_scalar_blocks);
+        mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
+        num_radix_blocks, num_scalar_blocks);
  }
 }

@@ -671,8 +686,9 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
    Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
-    uint32_t num_scalar_blocks) {
+    void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
    PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -690,7 +706,8 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
  auto sign = mem_ptr->tmp_lwe_array_out;
  host_integer_radix_scalar_difference_check_kb<Torus>(
      streams, sign, lwe_array_in, scalar_blocks, h_scalar_blocks, mem_ptr,
-      mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks, num_scalar_blocks);
+      mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
+      num_radix_blocks, num_scalar_blocks);

  // There is no optimized CMUX for scalars, so we convert to a trivial
  // ciphertext
@@ -704,9 +721,10 @@ __host__ void host_integer_radix_scalar_maxmin_kb(

  // Selector
  // CMUX for Max or Min
-  host_integer_radix_cmux_kb<Torus>(
-      streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
+  host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
+                                    mem_ptr->tmp_lwe_array_out, lwe_array_left,
+                                    lwe_array_right, mem_ptr->cmux_buffer, bsks,
+                                    ksks, ms_noise_reduction_key);
 }

 template <typename Torus>
@@ -714,7 +732,9 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {

  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
    PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -787,7 +807,8 @@ __host__ void host_integer_radix_scalar_equality_check_kb(

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        lsb_streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_packed_input,
-        bsks, ksks, scalar_comparison_luts, num_halved_lsb_radix_blocks);
+        bsks, ksks, ms_noise_reduction_key, scalar_comparison_luts,
+        num_halved_lsb_radix_blocks);
  }
  //////////////
  // msb_in
@@ -804,12 +825,12 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
      PANIC("Cuda error: integer operation not supported")
    }

-    host_compare_blocks_with_zero<Torus>(msb_streams, &msb_out, &msb_in,
-                                         mem_ptr, bsks, ksks,
-                                         num_msb_radix_blocks, msb_lut);
-    are_all_comparisons_block_true<Torus>(msb_streams, &msb_out, &msb_out,
-                                          mem_ptr, bsks, ksks,
-                                          msb_out.num_radix_blocks);
+    host_compare_blocks_with_zero<Torus>(
+        msb_streams, &msb_out, &msb_in, mem_ptr, bsks, ksks,
+        ms_noise_reduction_key, num_msb_radix_blocks, msb_lut);
+    are_all_comparisons_block_true<Torus>(
+        msb_streams, &msb_out, &msb_out, mem_ptr, bsks, ksks,
+        ms_noise_reduction_key, msb_out.num_radix_blocks);
  }

  lsb_streams.synchronize();
@@ -819,11 +840,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
  case COMPARISON_TYPE::EQ:
    are_all_comparisons_block_true<Torus>(
        streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
+        ms_noise_reduction_key,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
    break;
  case COMPARISON_TYPE::NE:
    is_at_least_one_comparisons_block_true<Torus>(
        streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
+        ms_noise_reduction_key,
        num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
    break;
  default:
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cu
@@ -24,12 +24,13 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
 void cuda_integer_unsigned_scalar_div_radix_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi) {

  host_integer_unsigned_scalar_div_radix<uint64_t>(
      CudaStreams(streams), numerator_ct,
      (int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
-      scalar_divisor_ffi);
+      ms_noise_reduction_key, scalar_divisor_ffi);
 }

 void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
@@ -68,12 +69,13 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
 void cuda_integer_signed_scalar_div_radix_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {

  host_integer_signed_scalar_div_radix_kb<uint64_t>(
      CudaStreams(streams), numerator_ct,
      (int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
-      scalar_divisor_ffi, numerator_bits);
+      ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
 }

 void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
@@ -113,7 +115,9 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
 void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
    CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    void const *clear_blocks, void const *h_clear_blocks,
@@ -122,9 +126,9 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
  host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
      CudaStreams(streams), quotient_ct, remainder_ct,
      (int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)ksks, scalar_divisor_ffi, divisor_has_at_least_one_set,
-      decomposed_divisor, num_scalars_divisor, (uint64_t *)clear_blocks,
-      (uint64_t *)h_clear_blocks, num_clear_blocks);
+      (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
+      divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
+      (uint64_t *)clear_blocks, (uint64_t *)h_clear_blocks, num_clear_blocks);
 }

 void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
@@ -164,7 +168,9 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
 void cuda_integer_signed_scalar_div_rem_radix_kb_64(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
    CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    uint32_t numerator_bits) {
@@ -172,8 +178,9 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
  host_integer_signed_scalar_div_rem_radix<uint64_t>(
      CudaStreams(streams), quotient_ct, remainder_ct,
      (int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)ksks, scalar_divisor_ffi, divisor_has_at_least_one_set,
-      decomposed_divisor, num_scalars_divisor, numerator_bits);
+      (uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
+      divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
+      numerator_bits);
 }

 void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_div.cuh
@@ -27,7 +27,9 @@ template <typename Torus>
 __host__ void host_integer_unsigned_scalar_div_radix(
    CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
    int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi) {

  if (scalar_divisor_ffi->is_abs_divisor_one) {
    return;
@@ -36,7 +38,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
  if (scalar_divisor_ffi->is_divisor_pow2) {
    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
        streams, numerator_ct, scalar_divisor_ffi->ilog2_divisor,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks,
+        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
        numerator_ct->num_radix_blocks);
    return;
  }
@@ -63,24 +65,26 @@ __host__ void host_integer_unsigned_scalar_div_radix(
                                       numerator_cpy, numerator_ct);

    host_integer_radix_scalar_mul_high_kb<Torus>(
-        streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks, bsks,
-        scalar_divisor_ffi);
+        streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks,
+        ms_noise_reduction_key, bsks, scalar_divisor_ffi);

    host_sub_and_propagate_single_carry<Torus>(
        streams, numerator_ct, numerator_cpy, nullptr, nullptr,
-        mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
+        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
+        FLAG_NONE, (uint32_t)0);

    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
        streams, numerator_ct, (uint32_t)1, mem_ptr->logical_scalar_shift_mem,
-        bsks, ksks, numerator_ct->num_radix_blocks);
+        bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);

    host_add_and_propagate_single_carry<Torus>(
        streams, numerator_ct, numerator_cpy, nullptr, nullptr,
-        mem_ptr->scp_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
+        mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
+        (uint32_t)0);

    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
        streams, numerator_ct, scalar_divisor_ffi->shift_post - (uint32_t)1,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks,
+        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
        numerator_ct->num_radix_blocks);

    return;
@@ -88,16 +92,16 @@ __host__ void host_integer_unsigned_scalar_div_radix(

  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
      streams, numerator_ct, scalar_divisor_ffi->shift_pre,
-      mem_ptr->logical_scalar_shift_mem, bsks, ksks,
+      mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
      numerator_ct->num_radix_blocks);

-  host_integer_radix_scalar_mul_high_kb<Torus>(streams, numerator_ct,
-                                               mem_ptr->scalar_mul_high_mem,
-                                               ksks, bsks, scalar_divisor_ffi);
+  host_integer_radix_scalar_mul_high_kb<Torus>(
+      streams, numerator_ct, mem_ptr->scalar_mul_high_mem, ksks,
+      ms_noise_reduction_key, bsks, scalar_divisor_ffi);

  host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
      streams, numerator_ct, scalar_divisor_ffi->shift_post,
-      mem_ptr->logical_scalar_shift_mem, bsks, ksks,
+      mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
      numerator_ct->num_radix_blocks);
 }

@@ -121,8 +125,9 @@ template <typename Torus>
 __host__ void host_integer_signed_scalar_div_radix_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
    int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
-    uint32_t numerator_bits) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {

  if (scalar_divisor_ffi->is_abs_divisor_one) {
    if (scalar_divisor_ffi->is_divisor_negative) {
@@ -153,20 +158,23 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
+        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
+        ms_noise_reduction_key);

    host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
        streams, tmp,
        numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks, tmp->num_radix_blocks);
+        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
+        tmp->num_radix_blocks);

    host_add_and_propagate_single_carry<Torus>(
        streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
-        ksks, FLAG_NONE, (uint32_t)0);
+        ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
+        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
+        ms_noise_reduction_key);

  } else if (!scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
@@ -174,11 +182,12 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

    host_integer_radix_signed_scalar_mul_high_kb<Torus>(
        streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
-        bsks);
+        ms_noise_reduction_key, bsks);

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, tmp, scalar_divisor_ffi->shift_post,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
+        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
+        ms_noise_reduction_key);

    CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
@@ -186,11 +195,12 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, xsign, numerator_bits - 1,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
+        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
+        ms_noise_reduction_key);

    host_sub_and_propagate_single_carry<Torus>(
        streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
-        bsks, ksks, FLAG_NONE, (uint32_t)0);
+        bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

  } else {

@@ -199,15 +209,16 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

    host_integer_radix_signed_scalar_mul_high_kb<Torus>(
        streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
-        bsks);
+        ms_noise_reduction_key, bsks);

    host_add_and_propagate_single_carry<Torus>(
        streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
-        ksks, FLAG_NONE, (uint32_t)0);
+        ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, tmp, scalar_divisor_ffi->shift_post,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
+        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
+        ms_noise_reduction_key);

    CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
@@ -215,11 +226,12 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

    host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
        streams, xsign, numerator_bits - 1,
-        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
+        mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
+        ms_noise_reduction_key);

    host_sub_and_propagate_single_carry<Torus>(
        streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
-        bsks, ksks, FLAG_NONE, (uint32_t)0);
+        bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
  }

  if (scalar_divisor_ffi->is_divisor_negative) {
@@ -251,7 +263,9 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
    CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
    CudaRadixCiphertextFFI *remainder_ct,
    int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    Torus const *clear_blocks, Torus const *h_clear_blocks,
@@ -261,17 +275,18 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                     numerator_ct, quotient_ct);

-  host_integer_unsigned_scalar_div_radix(streams, quotient_ct,
-                                         mem_ptr->unsigned_div_mem, bsks, ksks,
-                                         scalar_divisor_ffi);
+  host_integer_unsigned_scalar_div_radix(
+      streams, quotient_ct, mem_ptr->unsigned_div_mem, bsks, ksks,
+      ms_noise_reduction_key, scalar_divisor_ffi);

  if (scalar_divisor_ffi->is_divisor_pow2) {

    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                       remainder_ct, numerator_ct);
-    host_integer_radix_scalar_bitop_kb(
-        streams, remainder_ct, remainder_ct, clear_blocks, h_clear_blocks,
-        num_clear_blocks, mem_ptr->bitop_mem, bsks, ksks);
+    host_integer_radix_scalar_bitop_kb(streams, remainder_ct, remainder_ct,
+                                       clear_blocks, h_clear_blocks,
+                                       num_clear_blocks, mem_ptr->bitop_mem,
+                                       bsks, ksks, ms_noise_reduction_key);

  } else {
    if (!scalar_divisor_ffi->is_divisor_zero) {
@@ -284,13 +299,15 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
        host_integer_scalar_mul_radix<Torus>(
            streams, remainder_ct, decomposed_divisor,
            divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
-            mem_ptr->params.message_modulus, num_scalars_divisor);
+            ms_noise_reduction_key, mem_ptr->params.message_modulus,
+            num_scalars_divisor);
      }
    }

    host_sub_and_propagate_single_carry(
        streams, numerator_ct, remainder_ct, nullptr, nullptr,
-        mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
+        mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
+        FLAG_NONE, (uint32_t)0);

    copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                       remainder_ct, numerator_ct);
@@ -318,7 +335,9 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
    CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
    CudaRadixCiphertextFFI *remainder_ct,
    int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
    uint64_t const *divisor_has_at_least_one_set,
    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
    uint32_t numerator_bits) {
@@ -327,13 +346,13 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                     numerator_ct, quotient_ct);

-  host_integer_signed_scalar_div_radix_kb(streams, quotient_ct,
-                                          mem_ptr->signed_div_mem, bsks, ksks,
-                                          scalar_divisor_ffi, numerator_bits);
+  host_integer_signed_scalar_div_radix_kb(
+      streams, quotient_ct, mem_ptr->signed_div_mem, bsks, ksks,
+      ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);

-  host_propagate_single_carry<Torus>(streams, quotient_ct, nullptr, nullptr,
-                                     mem_ptr->scp_mem, bsks, ksks, FLAG_NONE,
-                                     (uint32_t)0);
+  host_propagate_single_carry<Torus>(
+      streams, quotient_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
+      ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);

  if (!scalar_divisor_ffi->is_divisor_negative &&
      scalar_divisor_ffi->is_divisor_pow2) {
@@ -342,7 +361,7 @@ __host__ void host_integer_signed_scalar_div_rem_radix(

    host_integer_radix_logical_scalar_shift_kb_inplace(
        streams, remainder_ct, scalar_divisor_ffi->ilog2_divisor,
-        mem_ptr->logical_scalar_shift_mem, bsks, ksks,
+        mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
        remainder_ct->num_radix_blocks);

  } else if (!scalar_divisor_ffi->is_divisor_zero) {
@@ -356,13 +375,15 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
      host_integer_scalar_mul_radix<Torus>(
          streams, remainder_ct, decomposed_divisor,
          divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
-          mem_ptr->params.message_modulus, num_scalars_divisor);
+          ms_noise_reduction_key, mem_ptr->params.message_modulus,
+          num_scalars_divisor);
    }
  }

  host_sub_and_propagate_single_carry(
      streams, numerator_ct, remainder_ct, nullptr, nullptr,
-      mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
+      mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
+      FLAG_NONE, (uint32_t)0);

  copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
                                     remainder_ct, numerator_ct);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cu
@@ -22,13 +22,15 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
    uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
-    int8_t *mem, void *const *bsks, void *const *ksks, uint32_t polynomial_size,
-    uint32_t message_modulus, uint32_t num_scalars) {
+    int8_t *mem, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {

  host_integer_scalar_mul_radix<uint64_t>(
      CudaStreams(streams), lwe_array, decomposed_scalar, has_at_least_one_set,
      reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
-      (uint64_t **)(ksks), message_modulus, num_scalars);
+      (uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
+      num_scalars);
 }

 void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -46,6 +46,7 @@ __host__ void host_integer_scalar_mul_radix(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
    T const *decomposed_scalar, T const *has_at_least_one_set,
    int_scalar_mul_buffer<T> *mem, void *const *bsks, T *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t message_modulus, uint32_t num_scalars) {

  auto num_radix_blocks = lwe_array->num_radix_blocks;
@@ -68,7 +69,7 @@ __host__ void host_integer_scalar_mul_radix(
          num_radix_blocks, lwe_array, 0, num_radix_blocks);
      host_integer_radix_logical_scalar_shift_kb_inplace<T>(
          streams, &shift_input, shift_amount, mem->logical_scalar_shift_buffer,
-          bsks, ksks, num_radix_blocks);
+          bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
    } else {
      // create trivial assign for value = 0
      set_zero_radix_ciphertext_slice_async<T>(
@@ -112,14 +113,15 @@ __host__ void host_integer_scalar_mul_radix(
  } else {
    host_integer_partial_sum_ciphertexts_vec_kb<T>(
        streams, lwe_array, all_shifted_buffer, bsks, ksks,
-        mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);
+        ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem, num_radix_blocks,
+        j);

    auto scp_mem_ptr = mem->sc_prop_mem;
    uint32_t requested_flag = outputFlag::FLAG_NONE;
    uint32_t uses_carry = 0;
-    host_propagate_single_carry<T>(streams, lwe_array, nullptr, nullptr,
-                                   scp_mem_ptr, bsks, ksks, requested_flag,
-                                   uses_carry);
+    host_propagate_single_carry<T>(
+        streams, lwe_array, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
+        ms_noise_reduction_key, requested_flag, uses_carry);
  }
 }

@@ -168,6 +170,7 @@ template <typename Torus>
 __host__ void host_integer_radix_scalar_mul_high_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *ct,
    int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {

  if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
@@ -188,7 +191,7 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
          streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
          mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
-          tmp_ffi->num_radix_blocks);
+          ms_noise_reduction_key, tmp_ffi->num_radix_blocks);

    } else {

@@ -196,7 +199,8 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
          streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
          scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
          mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
-          mem_ptr->params.message_modulus, scalar_divisor_ffi->num_scalars);
+          ms_noise_reduction_key, mem_ptr->params.message_modulus,
+          scalar_divisor_ffi->num_scalars);
    }
  }

@@ -207,7 +211,9 @@ template <typename Torus>
 __host__ void host_integer_radix_signed_scalar_mul_high_kb(
    CudaStreams streams, CudaRadixCiphertextFFI *ct,
    int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
-    const CudaScalarDivisorFFI *scalar_divisor_ffi, void *const *bsks) {
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks) {

  if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
    set_zero_radix_ciphertext_slice_async<Torus>(
@@ -219,7 +225,7 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(

  host_extend_radix_with_sign_msb<Torus>(
      streams, tmp_ffi, ct, mem_ptr->extend_radix_mem, ct->num_radix_blocks,
-      bsks, (uint64_t **)ksks);
+      bsks, (uint64_t **)ksks, ms_noise_reduction_key);

  if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
      !scalar_divisor_ffi->is_abs_chosen_multiplier_one &&
@@ -229,13 +235,14 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
          streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
          mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
-          tmp_ffi->num_radix_blocks);
+          ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
    } else {
      host_integer_scalar_mul_radix<Torus>(
          streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
          scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
          mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
-          mem_ptr->params.message_modulus, scalar_divisor_ffi->num_scalars);
+          ms_noise_reduction_key, mem_ptr->params.message_modulus,
+          scalar_divisor_ffi->num_scalars);
    }
  }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -22,12 +22,13 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(

 void cuda_integer_radix_scalar_rotate_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
      CudaStreams(streams), lwe_array, n,
      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (uint64_t **)(ksks), ms_noise_reduction_key);
 }

 void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -28,7 +28,8 @@ template <typename Torus>
 __host__ void host_integer_radix_scalar_rotate_kb_inplace(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
    int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
-    Torus *const *ksks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto num_blocks = lwe_array->num_radix_blocks;
  auto params = mem->params;
@@ -73,7 +74,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
-        lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
+        ms_noise_reduction_key, lut_bivariate, num_blocks,
+        lut_bivariate->params.message_modulus);

  } else {
    // rotate left as the blocks are from LSB to MSB
@@ -97,7 +99,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
-        lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
+        ms_noise_reduction_key, lut_bivariate, num_blocks,
+        lut_bivariate->params.message_modulus);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cu
@@ -26,12 +26,13 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
 /// rotations - 1 The remaining blocks are padded with zeros
 void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
      CudaStreams(streams), lwe_array, shift,
      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), lwe_array->num_radix_blocks);
+      (uint64_t **)(ksks), ms_noise_reduction_key, lwe_array->num_radix_blocks);
 }

 uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
@@ -63,12 +64,13 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
 /// zeros as would be done in the logical shift.
 void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
      CudaStreams(streams), lwe_array, shift,
      (int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (uint64_t **)(ksks), ms_noise_reduction_key);
 }

 void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -28,7 +28,9 @@ template <typename Torus>
 __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
    int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
-    Torus *const *ksks, uint32_t num_blocks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_blocks) {

  if (lwe_array->num_radix_blocks < num_blocks)
    PANIC("Cuda error: input does not have enough blocks")
@@ -79,8 +81,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, &partial_current_blocks, &partial_current_blocks,
-        &partial_previous_blocks, bsks, ksks, lut_bivariate,
-        partial_block_count, lut_bivariate->params.message_modulus);
+        &partial_previous_blocks, bsks, ksks, ms_noise_reduction_key,
+        lut_bivariate, partial_block_count,
+        lut_bivariate->params.message_modulus);

  } else {
    // right shift
@@ -110,8 +113,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, partial_current_blocks, partial_current_blocks,
-        &partial_next_blocks, bsks, ksks, lut_bivariate, partial_block_count,
-        lut_bivariate->params.message_modulus);
+        &partial_next_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
+        partial_block_count, lut_bivariate->params.message_modulus);
  }
 }

@@ -132,7 +135,8 @@ template <typename Torus>
 __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
    int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
-    Torus *const *ksks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto num_blocks = lwe_array->num_radix_blocks;
  auto params = mem->params;
@@ -201,8 +205,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(

        integer_radix_apply_bivariate_lookup_table_kb<Torus>(
            streams, partial_current_blocks, partial_current_blocks,
-            &partial_next_blocks, bsks, ksks, lut_bivariate,
-            partial_block_count, lut_bivariate->params.message_modulus);
+            &partial_next_blocks, bsks, ksks, ms_noise_reduction_key,
+            lut_bivariate, partial_block_count,
+            lut_bivariate->params.message_modulus);
      }
      // Since our CPU threads will be working on different streams we shall
      // Ensure the work in the main stream is completed
@@ -211,7 +216,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
          mem->lut_buffers_univariate[num_bits_in_block - 1];
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          mem->local_streams_1, &padding_block, &last_block_copy, bsks, ksks,
-          lut_univariate_padding_block, 1);
+          ms_noise_reduction_key, lut_univariate_padding_block, 1);
      // Replace blocks 'pulled' from the left with the correct padding
      // block
      for (uint i = 0; i < rotations; i++) {
@@ -225,7 +230,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
            mem->lut_buffers_univariate[shift_within_block - 1];
        integer_radix_apply_univariate_lookup_table_kb<Torus>(
            mem->local_streams_2, &last_block, &last_block_copy, bsks, ksks,
-            lut_univariate_shift_last_block, 1);
+            ms_noise_reduction_key, lut_univariate_shift_last_block, 1);
      }

      mem->local_streams_1.synchronize();
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
@@ -22,12 +22,13 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
 void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
    CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
      CudaStreams(streams), lwe_array, lwe_shift,
      (int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (uint64_t **)(ksks), ms_noise_reduction_key);
 }

 void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -29,7 +29,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
    CudaRadixCiphertextFFI const *lwe_shift,
    int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
-    Torus *const *ksks) {
+    Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
  cuda_set_device(streams.gpu_index(0));

  if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)
@@ -56,6 +57,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  // Extract all bits
  auto bits = mem->tmp_bits;
  extract_n_bits<Torus>(streams, bits, lwe_array, bsks, ksks,
+                        ms_noise_reduction_key,
                        num_radix_blocks * bits_per_block, num_radix_blocks,
                        mem->bit_extract_luts);

@@ -77,8 +79,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  // so that it is already aligned to the correct position of the cmux input
  // and we reduce noise growth
  extract_n_bits<Torus>(streams, shift_bits, lwe_shift, bsks, ksks,
-                        max_num_bits_that_tell_shift, num_radix_blocks,
-                        mem->bit_extract_luts_with_offset_2);
+                        ms_noise_reduction_key, max_num_bits_that_tell_shift,
+                        num_radix_blocks, mem->bit_extract_luts_with_offset_2);

  // If signed, do an "arithmetic shift" by padding with the sign bit
  CudaRadixCiphertextFFI last_bit;
@@ -150,7 +152,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    // control_bit|b|a
    host_pack_bivariate_blocks<Torus>(
        streams, mux_inputs, mux_lut->lwe_indexes_out, rotated_input,
-        input_bits_a, mux_lut->lwe_indexes_in.get(), 2, total_nb_bits,
+        input_bits_a, mux_lut->lwe_indexes_in, 2, total_nb_bits,
        mem->params.message_modulus, mem->params.carry_modulus);

    // The shift bit is already properly aligned/positioned
@@ -161,7 +163,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    // we have
    // control_bit|b|a
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, input_bits_a, mux_inputs, bsks, ksks, mux_lut, total_nb_bits);
+        streams, input_bits_a, mux_inputs, bsks, ksks, ms_noise_reduction_key,
+        mux_lut, total_nb_bits);
  }

  // Initializes the output
@@ -193,8 +196,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    // To give back a clean ciphertext
    auto cleaning_lut = mem->cleaning_lut;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, lwe_array, lwe_array, bsks, ksks, cleaning_lut,
-        num_radix_blocks);
+        streams, lwe_array, lwe_array, bsks, ksks, ms_noise_reduction_key,
+        cleaning_lut, num_radix_blocks);
  }
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cu
@@ -23,12 +23,14 @@ void cuda_sub_and_propagate_single_carry_kb_64_inplace(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
    const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t requested_flag, uint32_t uses_carry) {
  PUSH_RANGE("sub")
  host_sub_and_propagate_single_carry<uint64_t>(
      CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
      (int_sub_and_propagate<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
-      requested_flag, uses_carry);
+      ms_noise_reduction_key, requested_flag, uses_carry);
  POP_RANGE()
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/subtraction.cuh
@@ -33,6 +33,7 @@ void host_sub_and_propagate_single_carry(
    const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *input_carries,
    int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t requested_flag, uint32_t uses_carry) {

  host_integer_radix_negation<Torus>(
@@ -41,7 +42,8 @@ void host_sub_and_propagate_single_carry(

  host_add_and_propagate_single_carry<Torus>(
      streams, lhs_array, mem->neg_rhs_array, carry_out, input_carries,
-      mem->sc_prop_mem, bsks, ksks, requested_flag, uses_carry);
+      mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key, requested_flag,
+      uses_carry);
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -204,20 +204,20 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_2_2_params(
 }

 template <typename InputTorus, typename OutputTorus>
-void execute_pbs_async(CudaStreams streams,
-                       const LweArrayVariant<OutputTorus> &lwe_array_out,
-                       const LweArrayVariant<InputTorus> &lwe_output_indexes,
-                       const std::vector<OutputTorus *> lut_vec,
-                       const std::vector<InputTorus *> lut_indexes_vec,
-                       const LweArrayVariant<InputTorus> &lwe_array_in,
-                       const LweArrayVariant<InputTorus> &lwe_input_indexes,
-                       void *const *bootstrapping_keys,
-                       std::vector<int8_t *> pbs_buffer,
-                       uint32_t glwe_dimension, uint32_t lwe_dimension,
-                       uint32_t polynomial_size, uint32_t base_log,
-                       uint32_t level_count, uint32_t grouping_factor,
-                       uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
-                       uint32_t num_many_lut, uint32_t lut_stride) {
+void execute_pbs_async(
+    CudaStreams streams, const LweArrayVariant<OutputTorus> &lwe_array_out,
+    const LweArrayVariant<InputTorus> &lwe_output_indexes,
+    const std::vector<OutputTorus *> lut_vec,
+    const std::vector<InputTorus *> lut_indexes_vec,
+    const LweArrayVariant<InputTorus> &lwe_array_in,
+    const LweArrayVariant<InputTorus> &lwe_input_indexes,
+    void *const *bootstrapping_keys,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
+    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t grouping_factor,
+    uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
+    uint32_t num_many_lut, uint32_t lut_stride) {

  if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
    // 32 bits
@@ -310,13 +310,17 @@ void execute_pbs_async(CudaStreams streams,
        auto d_lut_vector_indexes =
            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);

+        void *zeros = nullptr;
+        if (ms_noise_reduction_key != nullptr &&
+            ms_noise_reduction_key->ptr != nullptr)
+          zeros = ms_noise_reduction_key->ptr[i];
        cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
            streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
            current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
            current_lwe_array_in, current_lwe_input_indexes,
-            bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu,
-            num_many_lut, lut_stride);
+            bootstrapping_keys[i], ms_noise_reduction_key, zeros, pbs_buffer[i],
+            lwe_dimension, glwe_dimension, polynomial_size, base_log,
+            level_count, num_inputs_on_gpu, num_many_lut, lut_stride);
      }
      break;
    default:
@@ -370,11 +374,16 @@ void execute_pbs_async(CudaStreams streams,
        auto d_lut_vector_indexes =
            lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);

+        void *zeros = nullptr;
+        if (ms_noise_reduction_key != nullptr &&
+            ms_noise_reduction_key->ptr != nullptr)
+          zeros = ms_noise_reduction_key->ptr[i];
        cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
            streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
            lut_vec[i], current_lwe_array_in, bootstrapping_keys[i],
-            pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
-            base_log, level_count, num_inputs_on_gpu);
+            ms_noise_reduction_key, zeros, pbs_buffer[i], lwe_dimension,
+            glwe_dimension, polynomial_size, base_log, level_count,
+            num_inputs_on_gpu);
      }
      break;
    default:
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_128.cuh
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -80,7 +80,9 @@ __global__ void device_programmable_bootstrap_cg(
  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  const Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
+      (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
+          ? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
+          : &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];

  const Torus *block_lut_vector =
      &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -650,15 +650,33 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
-    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *ms_drift_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
+    uint32_t lut_stride) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be <= 64")

  pbs_buffer<uint64_t, CLASSICAL> *buffer =
      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;

+  // If the parameters contain drift noise reduction key, then apply it
+  if (buffer->noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
+    uint32_t log_modulus = log2(polynomial_size) + 1;
+    host_drift_modulus_switch<uint64_t>(
+        static_cast<cudaStream_t>(stream), gpu_index, buffer->temp_lwe_array_in,
+        static_cast<uint64_t const *>(lwe_array_in),
+        static_cast<uint64_t const *>(lwe_input_indexes),
+        static_cast<uint64_t *>(ms_drift_noise_reduction_ptr),
+        lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
+        ms_noise_reduction_key->ms_input_variance,
+        ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
+        log_modulus);
+  } else {
+    buffer->temp_lwe_array_in =
+        const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
+  }
  check_cuda_error(cudaGetLastError());

  switch (buffer->pbs_variant) {
@@ -669,7 +687,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<const uint64_t *>(lwe_output_indexes),
        static_cast<const uint64_t *>(lut_vector),
        static_cast<const uint64_t *>(lut_vector_indexes),
-        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
@@ -684,7 +702,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<const uint64_t *>(lwe_output_indexes),
        static_cast<const uint64_t *>(lut_vector),
        static_cast<const uint64_t *>(lut_vector_indexes),
-        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
@@ -696,7 +714,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
        static_cast<const uint64_t *>(lwe_output_indexes),
        static_cast<const uint64_t *>(lut_vector),
        static_cast<const uint64_t *>(lut_vector_indexes),
-        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
        static_cast<const uint64_t *>(lwe_input_indexes),
        static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -56,7 +56,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  const Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
+      (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
+          ? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
+          : &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];

  const Torus *block_lut_vector =
      &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cu
@@ -36,7 +36,7 @@ uint64_t scratch_cuda_programmable_bootstrap_128(
 template <typename InputTorus>
 void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
-    __uint128_t const *lut_vector, InputTorus const *lwe_array_in,
+    __uint128_t const *lut_vector, InputTorus *lwe_array_in,
    double const *bootstrapping_key,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -83,7 +83,7 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
 template <typename InputTorus>
 void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
-    __uint128_t const *lut_vector, InputTorus const *lwe_array_in,
+    __uint128_t const *lut_vector, InputTorus *lwe_array_in,
    double const *bootstrapping_key,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -132,17 +132,36 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    __uint128_t const *lut_vector, void const *lwe_array_in,
    void const *bootstrapping_key,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_drift_noise_reduction_ptr,
+    void const *ms_noise_reduction_ptr,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be <= 64")

+  // If the parameters contain drift noise reduction key, then apply it
+  if (buffer->noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
+    uint32_t log_modulus = log2(polynomial_size) + 1;
+    host_drift_modulus_switch<InputTorus>(
+        static_cast<cudaStream_t>(stream), gpu_index,
+        static_cast<InputTorus *>(buffer->temp_lwe_array_in),
+        static_cast<InputTorus const *>(lwe_array_in),
+        static_cast<uint64_t const *>(buffer->trivial_indexes),
+        static_cast<const InputTorus *>(ms_noise_reduction_ptr),
+        lwe_dimension + 1, num_samples, ms_drift_noise_reduction_ptr->num_zeros,
+        ms_drift_noise_reduction_ptr->ms_input_variance,
+        ms_drift_noise_reduction_ptr->ms_r_sigma,
+        ms_drift_noise_reduction_ptr->ms_bound, log_modulus);
+  } else {
+    buffer->temp_lwe_array_in =
+        const_cast<InputTorus *>(static_cast<const InputTorus *>(lwe_array_in));
+  }
  switch (buffer->pbs_variant) {
  case DEFAULT:
    executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<InputTorus>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
-        lut_vector, static_cast<InputTorus const *>(lwe_array_in),
+        lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
@@ -150,7 +169,7 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
    executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
        InputTorus>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
-        lut_vector, static_cast<InputTorus const *>(lwe_array_in),
+        lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
@@ -215,7 +234,9 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
 void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *streams, uint32_t gpu_index, void *lwe_array_out,
    void const *lut_vector, void const *lwe_array_in,
-    void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
+    void const *bootstrapping_key,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void const *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {
  pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer =
@@ -224,8 +245,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
  host_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
      streams, gpu_index, lwe_array_out,
      static_cast<const __uint128_t *>(lut_vector), lwe_array_in,
-      bootstrapping_key, buffer, lwe_dimension, glwe_dimension, polynomial_size,
-      base_log, level_count, num_samples);
+      bootstrapping_key, ms_noise_reduction_key, ms_noise_reduction_ptr, buffer,
+      lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
+      num_samples);
 }

 /*
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic_128.cuh
@@ -668,7 +668,7 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
 template <typename InputTorus, class params, bool first_iter>
 __host__ void execute_step_one_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
-    InputTorus const *lwe_array_in, double const *bootstrapping_key,
+    InputTorus *lwe_array_in, double const *bootstrapping_key,
    __uint128_t *global_accumulator, double *global_join_buffer,
    PBS_MS_REDUCTION_T noise_reduction_type,
    uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
@@ -752,7 +752,7 @@ __host__ void execute_step_two_128(
 template <typename InputTorus, class params>
 __host__ void host_programmable_bootstrap_128(
    cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
-    __uint128_t const *lut_vector, InputTorus const *lwe_array_in,
+    __uint128_t const *lut_vector, InputTorus *lwe_array_in,
    double const *bootstrapping_key,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *pbs_buffer,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -84,7 +84,9 @@ __global__ void device_programmable_bootstrap_tbc(
  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  const Torus *block_lwe_array_in =
-      &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
+      (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
+          ? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
+          : &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];

  const Torus *block_lut_vector =
      &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
--- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cu
@@ -43,7 +43,8 @@ uint64_t scratch_cuda_expand_without_verification_64(
 void cuda_expand_without_verification_64(
    CudaStreamsFFI streams, void *lwe_array_out,
    const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *computing_ksks, void *const *casting_keys) {
+    void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  auto expand_buffer = reinterpret_cast<zk_expand_mem<uint64_t> *>(mem_ptr);

@@ -53,49 +54,49 @@ void cuda_expand_without_verification_64(
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
-        (uint64_t **)(computing_ksks));
+        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 512:
    host_expand_without_verification<uint64_t, AmortizedDegree<512>>(
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
-        (uint64_t **)(computing_ksks));
+        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 1024:
    host_expand_without_verification<uint64_t, AmortizedDegree<1024>>(
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
-        (uint64_t **)(computing_ksks));
+        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 2048:
    host_expand_without_verification<uint64_t, AmortizedDegree<2048>>(
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
-        (uint64_t **)(computing_ksks));
+        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 4096:
    host_expand_without_verification<uint64_t, AmortizedDegree<4096>>(
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
-        (uint64_t **)(computing_ksks));
+        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 8192:
    host_expand_without_verification<uint64_t, AmortizedDegree<8192>>(
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
-        (uint64_t **)(computing_ksks));
+        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  case 16384:
    host_expand_without_verification<uint64_t, AmortizedDegree<16384>>(
        streams, static_cast<uint64_t *>(lwe_array_out),
        static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
        expand_buffer, (uint64_t **)casting_keys, bsks,
-        (uint64_t **)(computing_ksks));
+        (uint64_t **)(computing_ksks), ms_noise_reduction_key);
    break;
  default:
    PANIC("CUDA error: lwe_dimension not supported."
--- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh
@@ -19,7 +19,8 @@ template <typename Torus, class params>
 __host__ void host_expand_without_verification(
    CudaStreams streams, Torus *lwe_array_out,
    const Torus *lwe_flattened_compact_array_in, zk_expand_mem<Torus> *mem_ptr,
-    Torus *const *casting_keys, void *const *bsks, Torus *const *compute_ksks) {
+    Torus *const *casting_keys, void *const *bsks, Torus *const *compute_ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
  // Expand
  auto casting_key_type = mem_ptr->casting_key_type;
  auto expanded_lwes = mem_ptr->tmp_expanded_lwes;
@@ -76,7 +77,7 @@ __host__ void host_expand_without_verification(

    // apply keyswitch to BIG
    execute_keyswitch_async<Torus>(
-        streams.get_ith(0), ksed_small_to_big_expanded_lwes,
+        streams.subset_first_gpu(), ksed_small_to_big_expanded_lwes,
        lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
        casting_keys, casting_input_dimension, casting_output_dimension,
        casting_ks_base_log, casting_ks_level, num_lwes);
@@ -95,8 +96,8 @@ __host__ void host_expand_without_verification(
  auto input = new CudaRadixCiphertextFFI;
  into_radix_ciphertext(input, lwe_array_input, 2 * num_lwes, lwe_dimension);
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, output, input, bsks, ksks, message_and_carry_extract_luts,
-      2 * num_lwes);
+      streams, output, input, bsks, ksks, ms_noise_reduction_key,
+      message_and_carry_extract_luts, 2 * num_lwes);
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -191,9 +191,9 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
          stream, gpu_index, (void *)d_lwe_ct_out_array,
          (void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
          (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
-          (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
-          lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-          pbs_level, number_of_inputs, num_many_lut, lut_stride);
+          (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, nullptr, nullptr,
+          pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
+          pbs_base_log, pbs_level, number_of_inputs, num_many_lut, lut_stride);
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               (glwe_dimension * polynomial_size + 1) *
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -98,8 +98,37 @@ pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
 pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
 pub type PBS_TYPE = ffi::c_uint;
 pub const PBS_MS_REDUCTION_T_NO_REDUCTION: PBS_MS_REDUCTION_T = 0;
-pub const PBS_MS_REDUCTION_T_CENTERED: PBS_MS_REDUCTION_T = 1;
+pub const PBS_MS_REDUCTION_T_DRIFT: PBS_MS_REDUCTION_T = 1;
+pub const PBS_MS_REDUCTION_T_CENTERED: PBS_MS_REDUCTION_T = 2;
 pub type PBS_MS_REDUCTION_T = ffi::c_uint;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct CudaModulusSwitchNoiseReductionKeyFFI {
+    pub ptr: *const *mut ffi::c_void,
+    pub num_zeros: u32,
+    pub ms_bound: f64,
+    pub ms_r_sigma: f64,
+    pub ms_input_variance: f64,
+}
+#[allow(clippy::unnecessary_operation, clippy::identity_op)]
+const _: () = {
+    ["Size of CudaModulusSwitchNoiseReductionKeyFFI"]
+        [::std::mem::size_of::<CudaModulusSwitchNoiseReductionKeyFFI>() - 40usize];
+    ["Alignment of CudaModulusSwitchNoiseReductionKeyFFI"]
+        [::std::mem::align_of::<CudaModulusSwitchNoiseReductionKeyFFI>() - 8usize];
+    ["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ptr"]
+        [::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ptr) - 0usize];
+    ["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::num_zeros"]
+        [::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, num_zeros) - 8usize];
+    ["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_bound"]
+        [::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ms_bound) - 16usize];
+    ["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_r_sigma"]
+        [::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ms_r_sigma) - 24usize];
+    ["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_input_variance"][::std::mem::offset_of!(
+        CudaModulusSwitchNoiseReductionKeyFFI,
+        ms_input_variance
+    ) - 32usize];
+};
 pub const SHIFT_OR_ROTATE_TYPE_LEFT_SHIFT: SHIFT_OR_ROTATE_TYPE = 0;
 pub const SHIFT_OR_ROTATE_TYPE_RIGHT_SHIFT: SHIFT_OR_ROTATE_TYPE = 1;
 pub const SHIFT_OR_ROTATE_TYPE_LEFT_ROTATE: SHIFT_OR_ROTATE_TYPE = 2;
@@ -252,55 +281,6 @@ const _: () = {
        divisor_has_more_bits_than_numerator
    ) - 60usize];
 };
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct CudaLweCiphertextListFFI {
-    pub ptr: *mut ffi::c_void,
-    pub num_radix_blocks: u32,
-    pub lwe_dimension: u32,
-}
-#[allow(clippy::unnecessary_operation, clippy::identity_op)]
-const _: () = {
-    ["Size of CudaLweCiphertextListFFI"]
-        [::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
-    ["Alignment of CudaLweCiphertextListFFI"]
-        [::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
-    ["Offset of field: CudaLweCiphertextListFFI::ptr"]
-        [::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
-    ["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
-        [::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
-    ["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
-        [::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
-};
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct CudaPackedGlweCiphertextListFFI {
-    pub ptr: *mut ffi::c_void,
-    pub storage_log_modulus: u32,
-    pub lwe_per_glwe: u32,
-    pub total_lwe_bodies_count: u32,
-    pub glwe_dimension: u32,
-    pub polynomial_size: u32,
-}
-#[allow(clippy::unnecessary_operation, clippy::identity_op)]
-const _: () = {
-    ["Size of CudaPackedGlweCiphertextListFFI"]
-        [::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
-    ["Alignment of CudaPackedGlweCiphertextListFFI"]
-        [::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
-    ["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
-        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
-    ["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
-        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
-    ["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
-        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
-    ["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
-        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
-    ["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
-        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
-    ["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
-        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
-};
 unsafe extern "C" {
    pub fn scratch_cuda_apply_univariate_lut_kb_64(
        streams: CudaStreamsFFI,
@@ -353,6 +333,7 @@ unsafe extern "C" {
        input_radix_lwe: *const CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        bsks: *const *mut ffi::c_void,
    );
 }
@@ -392,6 +373,7 @@ unsafe extern "C" {
        input_radix_lwe_2: *const CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        bsks: *const *mut ffi::c_void,
        num_radix_blocks: u32,
        shift: u32,
@@ -410,6 +392,7 @@ unsafe extern "C" {
        input_radix_lwe: *const CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        bsks: *const *mut ffi::c_void,
        num_luts: u32,
        lut_stride: u32,
@@ -440,6 +423,7 @@ unsafe extern "C" {
        input_blocks: *mut CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        bsks: *const *mut ffi::c_void,
        num_blocks: u32,
    );
@@ -479,6 +463,7 @@ unsafe extern "C" {
        is_bool_right: bool,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        mem_ptr: *mut i8,
        polynomial_size: u32,
        num_blocks: u32,
@@ -538,6 +523,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -570,6 +556,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -615,6 +602,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -655,6 +643,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -667,6 +656,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        num_scalar_blocks: u32,
    );
 }
@@ -704,6 +694,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -717,6 +708,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -753,6 +745,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -788,6 +781,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -849,6 +843,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        requested_flag: u32,
        uses_carry: u32,
    );
@@ -863,6 +858,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        requested_flag: u32,
        uses_carry: u32,
    );
@@ -908,6 +904,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        compute_overflow: u32,
        uses_input_borrow: u32,
    );
@@ -948,6 +945,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -986,6 +984,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        polynomial_size: u32,
        message_modulus: u32,
        num_scalars: u32,
@@ -1030,6 +1029,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -1064,6 +1064,7 @@ unsafe extern "C" {
        generates_or_propagates: *mut CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        bsks: *const *mut ffi::c_void,
        num_blocks: u32,
    );
@@ -1110,6 +1111,7 @@ unsafe extern "C" {
        is_signed: bool,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -1144,6 +1146,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        num_radix_blocks: u32,
    );
 }
@@ -1182,6 +1185,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        num_radix_blocks: u32,
    );
 }
@@ -1235,6 +1239,7 @@ unsafe extern "C" {
        input_radix_lwe: *const CudaRadixCiphertextFFI,
        mem_ptr: *mut i8,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        bsks: *const *mut ffi::c_void,
    );
 }
@@ -1276,6 +1281,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        requested_flag: u32,
        uses_carry: u32,
    );
@@ -1314,6 +1320,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
    );
 }
@@ -1353,6 +1360,7 @@ unsafe extern "C" {
        num_additional_blocks: u32,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -1389,6 +1397,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        numerator_bits: u32,
    );
@@ -1429,6 +1438,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        divisor_has_at_least_one_set: *const u64,
        decomposed_divisor: *const u64,
@@ -1474,6 +1484,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
        scalar_divisor_ffi: *const CudaScalarDivisorFFI,
        divisor_has_at_least_one_set: *const u64,
        decomposed_divisor: *const u64,
@@ -1518,6 +1529,7 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -1539,6 +1551,7 @@ unsafe extern "C" {
        pbs_base_log: u32,
        grouping_factor: u32,
        num_blocks_to_process: u32,
+        num_blocks: u32,
        message_modulus: u32,
        carry_modulus: u32,
        pbs_type: PBS_TYPE,
@@ -1556,6 +1569,7 @@ unsafe extern "C" {
        num_blocks_to_process: u32,
        mem: *mut i8,
        bsks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -1597,11 +1611,61 @@ unsafe extern "C" {
        mem_ptr: *mut i8,
        bsks: *const *mut ffi::c_void,
        ksks: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
    pub fn cleanup_cuda_integer_ilog2_kb_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
 }
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct CudaLweCiphertextListFFI {
+    pub ptr: *mut ffi::c_void,
+    pub num_radix_blocks: u32,
+    pub lwe_dimension: u32,
+}
+#[allow(clippy::unnecessary_operation, clippy::identity_op)]
+const _: () = {
+    ["Size of CudaLweCiphertextListFFI"]
+        [::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
+    ["Alignment of CudaLweCiphertextListFFI"]
+        [::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
+    ["Offset of field: CudaLweCiphertextListFFI::ptr"]
+        [::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
+    ["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
+        [::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
+    ["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
+        [::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
+};
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct CudaPackedGlweCiphertextListFFI {
+    pub ptr: *mut ffi::c_void,
+    pub storage_log_modulus: u32,
+    pub lwe_per_glwe: u32,
+    pub total_lwe_bodies_count: u32,
+    pub glwe_dimension: u32,
+    pub polynomial_size: u32,
+}
+#[allow(clippy::unnecessary_operation, clippy::identity_op)]
+const _: () = {
+    ["Size of CudaPackedGlweCiphertextListFFI"]
+        [::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
+    ["Alignment of CudaPackedGlweCiphertextListFFI"]
+        [::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
+};
 unsafe extern "C" {
    pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
        streams: CudaStreamsFFI,
@@ -1729,78 +1793,6 @@ unsafe extern "C" {
        mem_ptr_void: *mut *mut i8,
    );
 }
-unsafe extern "C" {
-    pub fn scratch_cuda_integer_aes_encrypt_64(
-        streams: CudaStreamsFFI,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: PBS_TYPE,
-        allocate_gpu_memory: bool,
-        noise_reduction_type: PBS_MS_REDUCTION_T,
-        num_aes_inputs: u32,
-        sbox_parallelism: u32,
-    ) -> u64;
-}
-unsafe extern "C" {
-    pub fn cuda_integer_aes_ctr_encrypt_64(
-        streams: CudaStreamsFFI,
-        output: *mut CudaRadixCiphertextFFI,
-        iv: *const CudaRadixCiphertextFFI,
-        round_keys: *const CudaRadixCiphertextFFI,
-        counter_bits_le_all_blocks: *const u64,
-        num_aes_inputs: u32,
-        mem_ptr: *mut i8,
-        bsks: *const *mut ffi::c_void,
-        ksks: *const *mut ffi::c_void,
-    );
-}
-unsafe extern "C" {
-    pub fn cleanup_cuda_integer_aes_encrypt_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
-}
-unsafe extern "C" {
-    pub fn scratch_cuda_integer_key_expansion_64(
-        streams: CudaStreamsFFI,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: PBS_TYPE,
-        allocate_gpu_memory: bool,
-        noise_reduction_type: PBS_MS_REDUCTION_T,
-    ) -> u64;
-}
-unsafe extern "C" {
-    pub fn cuda_integer_key_expansion_64(
-        streams: CudaStreamsFFI,
-        expanded_keys: *mut CudaRadixCiphertextFFI,
-        key: *const CudaRadixCiphertextFFI,
-        mem_ptr: *mut i8,
-        bsks: *const *mut ffi::c_void,
-        ksks: *const *mut ffi::c_void,
-    );
-}
-unsafe extern "C" {
-    pub fn cleanup_cuda_integer_key_expansion_64(
-        streams: CudaStreamsFFI,
-        mem_ptr_void: *mut *mut i8,
-    );
-}
 pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
 pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
 pub type KS_TYPE = ffi::c_uint;
@@ -1841,6 +1833,7 @@ unsafe extern "C" {
        bsks: *const *mut ffi::c_void,
        computing_ksks: *const *mut ffi::c_void,
        casting_keys: *const *mut ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
    );
 }
 unsafe extern "C" {
@@ -2308,6 +2301,8 @@ unsafe extern "C" {
        lwe_array_in: *const ffi::c_void,
        lwe_input_indexes: *const ffi::c_void,
        bootstrapping_key: *const ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
+        ms_noise_reduction_ptr: *mut ffi::c_void,
        buffer: *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
@@ -2327,6 +2322,8 @@ unsafe extern "C" {
        lut_vector: *const ffi::c_void,
        lwe_array_in: *const ffi::c_void,
        bootstrapping_key: *const ffi::c_void,
+        ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
+        ms_noise_reduction_ptr: *const ffi::c_void,
        buffer: *mut i8,
        lwe_dimension: u32,
        glwe_dimension: u32,
--- a/backends/tfhe-cuda-backend/wrapper.h
+++ b/backends/tfhe-cuda-backend/wrapper.h
@@ -2,7 +2,6 @@
 #include "cuda/include/ciphertext.h"
 #include "cuda/include/integer/compression/compression.h"
 #include "cuda/include/integer/integer.h"
-#include "cuda/include/aes/aes.h"
 #include "cuda/include/zk/zk.h"
 #include "cuda/include/keyswitch/keyswitch.h"
 #include "cuda/include/keyswitch/ks_enums.h"
--- a/backends/tfhe-hpu-backend/Cargo.toml
+++ b/backends/tfhe-hpu-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-hpu-backend"
-version = "0.3.0"
+version = "0.2.0"
 edition = "2021"
 license = "BSD-3-Clause-Clear"
 description = "HPU implementation on FPGA of TFHE-rs primitives."
@@ -33,7 +33,7 @@ serde = { version = "1", features = ["derive"] }
 toml = { version = "0.8", features = [] }
 paste = "1.0.15"
 thiserror = "1.0.61"
-bytemuck = { workspace = true }
+bytemuck = "1.16.0"
 anyhow = "1.0.82"
 lazy_static = "1.4.0"
 rand = "0.8.5"
--- a/backends/tfhe-hpu-backend/README.md
+++ b/backends/tfhe-hpu-backend/README.md
@@ -65,12 +65,9 @@ HPU configuration knobs are gathered in a TOML configuration file. This file des
          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_3in3.toml",
          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_1in3.toml",
          "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml"]
-  polling_us=2
+  polling_us=10
 [fpga.ffi.V80] # Hardware properties
-  id="${V80_PCIE_DEV}"
-  board_sn="${V80_SERIAL_NUMBER}"
-  hpu_path="${HPU_BACKEND_DIR}/config_store/v80_archives/psi64.hpu"
-  ami_path="${AMI_PATH}/ami.ko"
+  ami_dev="/dev/ami1" # Name of ami device
  qdma_h2c="/dev/qdma${V80_PCIE_DEV}001-MM-0" # QDma host to card device
  qdma_c2h="/dev/qdma${V80_PCIE_DEV}001-MM-1" # QDma card to host device

@@ -210,26 +207,11 @@ In order to run those applications on hardware, user must build from the project
 > ```

 ``` bash
-$ cargo build --release --features="hpu-v80" --example hpu_hlapi --example hpu_bench
+cargo build --release --features="hpu-v80" --example hpu_hlapi --example hpu_bench
 # Correctly set up the environment with the setup_hpu.sh script
-$ source setup_hpu.sh --config v80 -p
-# Source Xilinx environment (2024 or 2025 version)
-$ source /opt/xilinx/Vivado/2024.2/settings64.sh
-$ xsdb -eval "connect;puts [lsort -unique [regex -all -inline {( XFL[A-Z0-9]*)} [targets -target-properties]]]"
-****** Xilinx hw_server v2024.2
-  **** Build date : Oct 29 2024 at 10:16:47
-    ** Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.
-    ** Copyright 2022-2024 Advanced Micro Devices, Inc. All Rights Reserved.
-
-INFO: hw_server application started
-INFO: Use Ctrl-C to exit hw_server application
-
-INFO: To connect to this hw_server instance use url: TCP:127.0.0.1:3121
-
-{ XFL12E5XJHWLA}
-$ export V80_SERIAL_NUMBER=XFL12E5XJHWL
-$ ./target/release/examples/hpu_bench --integer-w 64 --integer-w 32 --iop MUL --iter 10
-$ ./target/release/examples/hpu_hlapi
+source setup_hpu.sh --config v80
+./target/release/examples/hpu_bench --integer-w 64 --integer-w 32 --iop MUL --iter 10
+./target/release/examples/hpu_hlapi
 ```

 > NB: The error that occurs when ".hpu" files weren't correctly fetched can be a bit enigmatic: `memory allocation of ... bytes failed`
@@ -238,31 +220,6 @@ $ ./target/release/examples/hpu_hlapi
 > make pull_hpu_files
 > ```

-> NB: tfhe-hpu-backend can only use one V80 board at this time but if you have several boards on your server you can do
-> ```bash
-> $ . setup_hpu.sh --config v80 -p
-> getopt: option requires an argument -- 'p'
-> Please select a device in following list (1st two digits):
-> 24:00.1 Processing accelerators: Xilinx Corporation Device 50b5
-> 61:00.1 Processing accelerators: Xilinx Corporation Device 50b5
-> $ . setup_hpu.sh --config v80 -p 61
-> $ source /opt/xilinx/Vivado/2024.2/settings64.sh
-> # if AMI driver is loaded and AMC version running is the expected one
-> $ cat /sys/module/ami/drivers/pci\:ami/0000\:61\:00.0/board_serial
-> XFL1UKRD42KW
-> # list serial number available on USB JTAG
-> $ xsdb -eval "connect;puts [lsort -unique [regex -all -inline {( XFL[A-Z0-9]*)} [targets -target-properties]]]"
-> ...
-> { XFL12E5XJHWLA} { XFL1UKRD42KWA}
-> $ export V80_SERIAL_NUMBER=XFL1UKRD42KW
-> $ ./target/release/examples/hpu_hlapi
-> ```
-
-> NB: By default setup_hpu.sh will set AMI_PATH to something like /opt/v80/ami/e55d02d where e55d02d is the git revision of AMI driver.
-> To run properly, You need to either place a compiled ami.ko from this revision in this directory or set AMI_PATH to your AVED extraction:
-> ```bash
-> export AMI_PATH=/home/user/AVED/sw/AMI/driver/
-> ```

 ## Test framework
 There is also a set of tests baked into tfhe-rs. Tests are gathered in testbundles over various integer widths.
@@ -278,7 +235,7 @@ Those tests have 5 sub-kind:
 The snippets below give some examples of commands that can be used for testing:
 ``` bash
 # Correctly set up the environment with the setup_hpu.sh script
-source setup_hpu.sh --config v80 -p
+source setup_hpu.sh --config v80 --init-qdma

 # Run all sub-kind for 64b integer width
 cargo test --release --features="hpu-v80" --test hpu -- u64
@@ -292,7 +249,7 @@ HPU is completely integrated in tfhe benchmark system. Performances results coul
 Three benchmarks can be started through the following Makefile targets, for simplicity:
 ``` bash
 # Do not forget to correctly set the environment beforehand
-source setup_hpu.sh --config v80 -p
+source setup_hpu.sh --config v80 --init-qdma

 # Run hlapi benches
 make test_high_level_api_hpu
--- a/Show More
+++ b/Show More