Compare commits

..

173 Commits

Author SHA1 Message Date
Baptiste Roux
51401dcf24 feat(hpu): Add Shift/Rot/Min/Max operation for Hpu in hlapi bench 2025-06-20 09:24:11 +02:00
Baptiste Roux
a271cedb05 fix(hpu): Remove some hardcoded filenames in tandem
Also enhance error handling related to user misconfiguration,
and fix a bug with ami devn reading
2025-06-20 09:04:22 +02:00
Arthur Meyre
9eb0e831f5 chore: fix wasm bench to use the proper parameter
javascript and their nonsensical fallbacks be damned to eternal suffering
2025-06-19 19:34:04 +02:00
Enzo Di Maria
7e4abfa4ff refactor(gpu): moving extend_radix_with_sign_msb_async to backend 2025-06-19 14:51:02 +02:00
Nicolas Sarlin
ce7c15585e chore(zk): refactor hashes to reuse code between proof and verify 2025-06-19 13:48:20 +02:00
Nicolas Sarlin
58f7457660 chore(zk): rename verify_inner to verify_impl to match the proof 2025-06-19 13:48:20 +02:00
David Testé
2d224e75a1 chore(ci): set pull-requests permission to write in commit checks
This is mandatory according to the action documentation,
notably to be able to write an issue comment within the pull-request.
2025-06-19 13:45:44 +02:00
Agnes Leroy
e5a9145cce fix(gpu): fix perf regression introduced in 1936ec6d84 2025-06-19 13:34:36 +02:00
tmontaigu
f5f7213289 feat: improve division for 2_2 parameters
The improvement is to compute the quotient digit by digit and
not bit by bit.

This could also probably work for 3_3 and 4_4 parameters, but it is not a priority.

This brings 64-bit division down to ~5.5s from 8.6s
2025-06-19 13:03:40 +02:00
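A plain-integer sketch of the digit-by-digit idea, assuming nothing about the FHE internals: with 2_2 parameters each block carries a 2-bit digit, so restoring long division can consume the dividend one base-4 digit per iteration instead of one bit, halving the loop count at the cost of trying a few candidate multiples per digit.

```rust
/// Schoolbook long division over clear u64 values, consuming DIGIT_BITS bits
/// of the numerator per iteration. DIGIT_BITS = 1 is classic bit-by-bit
/// restoring division; DIGIT_BITS = 2 mirrors a 2_2-style digit and runs the
/// outer loop half as many times.
fn long_division<const DIGIT_BITS: u32>(numerator: u64, divisor: u64) -> (u64, u64) {
    assert!(divisor != 0 && DIGIT_BITS >= 1 && 64 % DIGIT_BITS == 0);
    let base = 1u128 << DIGIT_BITS;
    let divisor = divisor as u128;
    let (mut quotient, mut remainder) = (0u128, 0u128);
    for i in (0..64 / DIGIT_BITS).rev() {
        // Shift the next digit of the numerator into the running remainder.
        let digit = ((numerator >> (i * DIGIT_BITS)) as u128) & (base - 1);
        remainder = (remainder << DIGIT_BITS) | digit;
        // Largest q_digit in 0..base such that q_digit * divisor <= remainder.
        let mut q_digit = 0u128;
        for candidate in 1..base {
            if candidate * divisor <= remainder {
                q_digit = candidate;
            }
        }
        remainder -= q_digit * divisor;
        quotient = (quotient << DIGIT_BITS) | q_digit;
    }
    (quotient as u64, remainder as u64)
}

fn main() {
    assert_eq!(long_division::<2>(860, 7), (860 / 7, 860 % 7));
}
```

With DIGIT_BITS = 2 a 64-bit dividend needs 32 outer iterations instead of 64, which is the flavor of saving behind the reported ~8.6s to ~5.5s drop.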
tmontaigu
b917cf4530 feat(core): plug XofSeed 2025-06-19 09:57:29 +02:00
Mayeul@Zama
1873b627d6 chore: add TODO for Glwe MS 2025-06-18 16:54:12 +02:00
Mayeul@Zama
cb8d753ea6 refactor(core): cleanup unused function 2025-06-18 16:54:12 +02:00
Mayeul@Zama
88e8fa6da9 refactor(core): separate BR and MS 2025-06-18 16:54:12 +02:00
Mayeul@Zama
0ea7c29dbd refactor(core): separate BR and MS 2025-06-18 16:54:12 +02:00
Mayeul@Zama
d90bd8bf89 feat(core): add grouping_factor to MultiBitModulusSwitchedCt trait 2025-06-18 16:54:12 +02:00
Mayeul@Zama
bf5e4474a2 feat(core): add ModulusSwitchedCt trait 2025-06-18 16:54:12 +02:00
Mayeul@Zama
7fd5321b78 refactor(core): std_multi_bit_blind_rotate_assign takes msed input 2025-06-18 16:54:12 +02:00
Mayeul@Zama
c168dea284 refactor(core): rename MultiBitModulusSwitchedCt to MultiBitModulusSwitchedLweCiphertext 2025-06-18 16:54:12 +02:00
Beka Barbakadze
1936ec6d84 refactor(gpu): refactor and optimize sum_ciphertext in cuda backend 2025-06-18 16:44:20 +02:00
Agnes Leroy
9864dba009 fix(gpu): fix degrees after scalar bitxor 2025-06-18 15:50:59 +02:00
Nicolas Sarlin
8c1ece4fd9 refactor(shortint): improve handling of empty compressed ct list 2025-06-18 11:08:59 +02:00
Nicolas Sarlin
343cad641c chore: TFHE-rs 1.3.0 2025-06-18 10:20:49 +02:00
David Testé
39d77299ed chore(bench): harmonize dex benchmark function names 2025-06-18 09:47:57 +02:00
Arthur Meyre
c841e3be6e chore: add new codeowners for HPU and CUDA code 2025-06-17 11:07:38 +02:00
Nicolas Sarlin
aaeb46f074 feat(shortint): add compression for squashed noise ciphertexts 2025-06-16 18:08:51 +02:00
Nicolas Sarlin
8f7281c219 fix(shortint): handle empty list in compression 2025-06-16 18:08:51 +02:00
tmontaigu
11e86e6162 chore(csprng): bump to 0.6.0
Some (breaking) changes were made to a trait in CSPRNG
2025-06-16 14:05:47 +02:00
David Testé
41c92e06a8 chore(ci): add missing env variable in cuda release workflow 2025-06-16 12:55:31 +02:00
Pedro Alves
f25f394763 feat(gpu): add support for GPU-accelerated expand to HL's CompactCiphertextList
- Drops integer's CudaCompactCiphertextList
2025-06-13 19:18:11 +02:00
pgardratzama
3230c4bb97 chore(hpu): devo was still there, tool compiled in release mode 2025-06-13 16:46:31 +02:00
Baptiste Roux
159a85fc8c chore(hpu): Fix some lint in hpu-v80 ffi 2025-06-13 16:46:31 +02:00
Helder Campos
4cec6fb247 chore(hpu): Fixed some README.md typos.
Also renamed pdi_mgmt to hpu_archive_mgmt to match the new naming
2025-06-13 16:46:31 +02:00
Baptiste Roux
16c997d686 feat(hpu): Add support for tandem pdi
Add support for hpu archive and Fpga reloading.
Rely on Tandem implementation for hot-reloading of FPGA.
Add reload procedure inside ffi/v80 backend.

Now, when starting an application on HpuV80, a version check is done first.
If the versions mismatch, a pdi reload is triggered
2025-06-13 16:46:31 +02:00
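The startup flow described above, as a hedged Rust sketch; the helper names are hypothetical stand-ins for the ffi/v80 backend calls, not its real API:

```rust
fn read_fpga_version() -> Result<u32, String> {
    Ok(0) // placeholder: would query the version of the running V80 bitstream
}

fn reload_pdi_from_archive() -> Result<(), String> {
    Ok(()) // placeholder: would hot-reload the pdi from the hpu archive via Tandem
}

/// On HpuV80 startup, check the running version first and trigger a pdi
/// reload on mismatch, per the commit message.
fn ensure_fpga_version(expected: u32) -> Result<(), String> {
    if read_fpga_version()? != expected {
        reload_pdi_from_archive()?;
    }
    Ok(())
}
```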
David Testé
dcd1af72d4 chore(ci): fix missing env variable for 4090 benchmark 2025-06-13 11:43:16 +02:00
Agnes Leroy
9bf9107e9e feat(gpu): add memory tracking functions for rand/rand bounded 2025-06-12 17:50:15 +02:00
Andrei Stoian
7986e0bf1d chore(gpu): skip packing ks test if it needs more ram than available 2025-06-12 17:47:10 +02:00
Agnes Leroy
55179c52a7 chore(gpu): fix ci on H100 2025-06-12 16:22:57 +02:00
Emmanuel Ferdman
c103f0380c fix(hpu): modernize logger interface
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
2025-06-12 09:04:31 +02:00
Nicolas Sarlin
8024753be0 fix(zk): test failed with trivial ct equal to 0 2025-06-11 18:40:32 +02:00
Nicolas Sarlin
506fdfbdd1 chore(zk): use Shake256 XoF instead of rand to generate gamma values 2025-06-11 18:03:12 +02:00
David Testé
a0e08b80e6 chore(ci): use gh cli instead of api calls done with curl
This refactor simplifies reading of the data_pr_close workflow.
2025-06-11 16:55:37 +02:00
Enzo Di Maria
d06c3e3926 refactor(gpu): moving sub_assign_async to backend 2025-06-11 16:34:46 +02:00
Arthur Meyre
2bf9d25402 feat: add multi bit pbs 128 with GGSW preparation in FFT domain 2025-06-11 14:47:05 +02:00
Arthur Meyre
fe73f101cc test: add test for an adapted parameter set for std multi bit pbs 128 2025-06-11 14:47:05 +02:00
Arthur Meyre
d12de58284 feat: add std pbs 128 2025-06-11 14:47:05 +02:00
Arthur Meyre
d4ea8cd85f feat: add parallel conversion to Fourier 128 for LweMultiBitBootstrappingKey 2025-06-11 14:47:05 +02:00
Arthur Meyre
64f2befd6c feat: add Fourier 128 variant of LweMultiBitBootstrappingKey 2025-06-11 14:47:05 +02:00
Arthur Meyre
ce372dcea9 refactor(core): split the LWE multi bit bsk entity file
- to prepare for fft128 variant addition
2025-06-11 14:47:05 +02:00
Agnes Leroy
46b4958c9c feat(gpu): add memory tracking functions for booleans 2025-06-11 13:32:06 +02:00
Agnes Leroy
b25bcbc607 feat(gpu): add mem tracking for eq/ne 2025-06-11 13:32:06 +02:00
Agnes Leroy
5dfacc7975 feat(gpu): add memory tracking for compression and decompression 2025-06-11 11:49:09 +02:00
Guillermo Oyarzun
3d857f62cc refactor(gpu): return trivial indexes after ms noise reduction 2025-06-11 11:28:10 +02:00
Nicolas Sarlin
54c314cd71 chore(ci): always run zk tests 2025-06-11 10:29:53 +02:00
Nicolas Sarlin
38a9853140 chore(zk): check crs conformance in backward compat test 2025-06-11 10:29:53 +02:00
Nicolas Sarlin
360097d70e chore(zk): use random seed in tests 2025-06-11 10:29:53 +02:00
Nicolas Sarlin
c94a76a85a fix(zk): overflow in noise tests 2025-06-11 10:29:53 +02:00
Nicolas Sarlin
be1ade6dd2 chore(zk)!: use an 8-byte dsep and a 128-bit SID in hash functions
BREAKING_CHANGE:
- PublicParams::from_vec methods have been updated to take an 8-byte dsep and an
  SID. CRS generated before this PR are still supported.
2025-06-11 10:29:53 +02:00
Pedro Alves
53845b298a fix(gpu): fix the packing keyswitch buffer not being allocated on large parameter sets 2025-06-11 08:58:09 +02:00
David Testé
11c0340eca chore(bench): plug server-side proof in zk benchmarks 2025-06-10 18:00:39 +02:00
Baptiste Roux
5e966a3d78 chore(hpu): changes based on code review 2025-06-10 17:43:35 +02:00
Baptiste Roux
443e02215f feat(hpu): Add recent IOp in integer benchmarks 2025-06-10 17:43:35 +02:00
Baptiste Roux
3c632c06ba chore(hpu): Fix/Changes to be compliant with CI 2025-06-10 17:43:35 +02:00
Baptiste Roux
833b593845 feat(hpu): Add support for Lead/Trail/Count/ilog2 in high-level API 2025-06-10 17:43:35 +02:00
JJ-hw
a20c90b090 feat(hpu): Add ILOG2/COUNT0/COUNT1/LEAD0/LEAD1/TRAIL0/TRAIL1 IOp.
Those IOps are tested within the new bitcnt category
2025-06-10 17:43:35 +02:00
Baptiste Roux
71e86f0522 feat(hpu): Add support for Shift/Rotate in high-level API
The scalar version is not supported yet
2025-06-10 17:43:35 +02:00
Baptiste Roux
cb45f7f429 feat(hpu): Add Rot/Shift IOp
A proper implementation of the scalar version needs an update in the firmware,
and thus wasn't done yet.
2025-06-10 17:43:35 +02:00
Baptiste Roux
05a51d47fa feat(hpu): Add check with Pbs gid definition
Currently it's a runtime check, but it prevents assigning the same Gid to two different luts
2025-06-10 17:43:35 +02:00
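One way such a runtime guard can look, sketched with illustrative names (not the tfhe-hpu-backend types): a registry that refuses to bind one Gid to two different LUTs.

```rust
use std::collections::HashMap;

#[derive(Default)]
struct GidRegistry {
    luts: HashMap<u32, Vec<u8>>, // Gid -> LUT table contents (illustrative)
}

impl GidRegistry {
    /// Re-registering the same LUT under its Gid is fine; a different LUT
    /// under an already-used Gid is the bug the runtime check catches.
    fn register(&mut self, gid: u32, lut: Vec<u8>) -> Result<(), String> {
        match self.luts.get(&gid) {
            Some(existing) if *existing != lut => {
                Err(format!("Gid {gid} is already bound to a different LUT"))
            }
            _ => {
                self.luts.insert(gid, lut);
                Ok(())
            }
        }
    }
}
```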
Baptiste Roux
2eb1ccd128 feat(hpu): Add support for DivRem in high-level API 2025-06-10 17:43:35 +02:00
JJ-hw
b7a518b9ee chore(hpu): Cleanup code following clippy advice
Also applied cargo fmt
2025-06-10 17:43:35 +02:00
JJ-hw
39faca219f feat(hpu): Add modulo. Note: not an optimized version; it uses the same algorithm as the division. 2025-06-10 17:43:35 +02:00
JJ-hw
fe7a8915bc feat(hpu): Add DIV/Divs IOp support
These IOps output the quotient and remainder for a numerator and divisor of the same size.
2025-06-10 17:43:35 +02:00
Baptiste Roux
96c8c44c71 feat(hpu): Enable some erc20 impl
With the support of overflowing ops, those implementations are now available on Hpu
2025-06-10 17:43:35 +02:00
Baptiste Roux
24d581afeb feat(hpu): Add support for Neg operation 2025-06-10 17:43:35 +02:00
Baptiste Roux
3c383ba18f feat(hpu): Add support for overflowing ops in the high-level-api 2025-06-10 17:43:35 +02:00
Baptiste Roux
7622122d90 feat(hpu): add support for overflowing iop
Add new Hpu IOps with overflowing flags. Currently there is only a simple Ilp implementation;
it should be extended to the llt one in the future.
2025-06-10 17:43:35 +02:00
Baptiste Roux
949d3e2153 feat(hpu): Add support for Min/Max in hl-api 2025-06-10 17:43:35 +02:00
Baptiste Roux
fb82c652e2 feat(hpu): Remove nops feature
This feature was superseded by the trivial one;
the nops option wasn't relevant or useful anymore.
2025-06-10 17:43:35 +02:00
Baptiste Roux
3af1937250 feat(hpu): add trivial execution in mockup
Enhances FW debugging with quicker execution times.
Also adds a trivial option to the hpu regression test for quick regression
testing of the FW generation with the mockup.
2025-06-10 17:43:35 +02:00
Baptiste Roux
05f7869c88 feat(mockup): Add support for trivial ciphertext display
This is here to enhance the IOp firmware debugging experience
2025-06-10 17:43:35 +02:00
Nicolas Sarlin
ab0ec4a238 chore(zk): mark non-pke proofs as experimental 2025-06-10 17:07:33 +02:00
dependabot[bot]
167329c52a chore(deps): bump dtolnay/rust-toolchain
Bumps [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) from 888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 to b3b07ba8b418998c39fb20f53e8b695cdcc8de1b.
- [Release notes](https://github.com/dtolnay/rust-toolchain/releases)
- [Commits](888c2e1ea6...b3b07ba8b4)

---
updated-dependencies:
- dependency-name: dtolnay/rust-toolchain
  dependency-version: b3b07ba8b418998c39fb20f53e8b695cdcc8de1b
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-06-10 17:04:02 +02:00
Arthur Meyre
8a280642a7 chore(shortint): alias correct parameters for KS PKE to compute
- has no impact today as the parameters were the same; however, given the
comment, it is best to expose the proper KS from PKE to compute parameters
2025-06-10 17:03:41 +02:00
Arthur Meyre
b29e82b96e test: add noise test check for High Level API 2025-06-10 17:03:41 +02:00
Arthur Meyre
9bda365691 chore(core): add noise distribution test tooling 2025-06-10 17:03:41 +02:00
Arthur Meyre
b686d5cb6a chore: update .gitignore 2025-06-10 17:03:41 +02:00
David Testé
2829c9cc92 chore(ci): parse all pbs counts files for dex benchmarks 2025-06-10 14:19:24 +02:00
Guillermo Oyarzun
0d81623a23 feat(gpu): add squash noise in the hlapi 2025-06-10 13:14:29 +02:00
David Testé
13d797fe9b chore(ci): add bench type to zk benchmarks artifact name
This is done to avoid name collision when both latency and throughput benchmarks are executed within the same workflow run.
2025-06-10 12:30:42 +02:00
Baptiste Roux
0ba2e5c6fd chore(hpu): fix issue with linter
The issue arose following a version update.
This function is clearly unused but is kept for huge memory API coherency
and should be used in the future.
2025-06-06 17:55:53 +02:00
Enzo Di Maria
ad3edf3cc3 refactor(gpu): moving scalar_mul_high_async to backend 2025-06-06 17:55:53 +02:00
Pedro Alves
16ff092ce4 fix(gpu): fix race condition on expand when on multi-gpu 2025-06-06 09:34:09 +02:00
Pedro Alves
f511bdc279 chore(gpu): add HL test for GPU expand and fix an issue with exception handling 2025-06-06 09:34:09 +02:00
Agnes Leroy
d3f36b1d86 chore(gpu): fallback to a6000 in case L40's are out of stock 2025-06-05 16:55:10 +02:00
Mayeul@Zama
fbb6b9ace3 refactor(shortint): rename apply_blind_rotate to apply_ms_blind_rotate 2025-06-05 14:56:09 +02:00
Mayeul@Zama
d1d1579187 refactor(core): separate modulus_switch from multi_bit_blind_rotate_assign 2025-06-05 14:56:09 +02:00
Mayeul@Zama
a268dced53 refactor(core): refactor MultiBitModulusSwitchedCt 2025-06-05 14:56:09 +02:00
Mayeul@Zama
13b2372624 refactor(core): refactor packing 2025-06-05 14:56:09 +02:00
Agnes Leroy
e2d622b186 feat(gpu): add memory tracking for neg and scalar div 2025-06-05 13:35:01 +02:00
Agnes Leroy
f55007c6fb feat(gpu): add mem tracking for div 2025-06-05 13:35:01 +02:00
Agnes Leroy
3ca85bf904 feat(boolean): add move_to_current_device for booleans 2025-06-05 11:45:54 +02:00
Andrei Stoian
ec78318af3 chore(gpu): prevent nvToolsExt inclusion when not profiling

fix(gpu): stdint
2025-06-05 11:45:32 +02:00
David Testé
8a312afbb7 chore(ci): fix benchmark matrix parameters generation
The previous implementation was done to please Zizmor and avoid
template-injection findings during analysis. This had a downside:
using the env directive implies a double interpolation that messes with
fromJSON() later and builds badly formatted matrix parameters.
2025-06-05 11:15:39 +02:00
David Testé
b61f1d864c chore(ci): check ks32 parameters with lattice estimator
A small refactoring has been done to handle ciphertext modulus in a more convenient way.
2025-06-04 17:19:17 +02:00
Agnes Leroy
15983a0718 feat(gpu): add memory tracking for mul/scalar mul 2025-06-04 16:42:51 +02:00
David Testé
856fc1a709 chore(ci): ignore stale action refs on rust-toolchain action
This action doesn't create releases, so the action refs don't point to a known tag.
If this zizmor finding is not ignored, the continuous integration pipeline breaks.
2025-06-04 11:48:01 +02:00
Pedro Alves
fe0a195630 chore(gpu): switches from the TBC PBS to the other variants for many inputs 2025-06-04 05:45:53 -03:00
tmontaigu
aca7e79585 feat(csprng): add Xof random generation
This adds a new kind of seed to the csprng

When created with such a seed, the AES-CTR random generator
initialization changes:
- The AES key used is initialized differently
- The AES-CTR starts with a CTR that may not be 0

The changes make it so that the counter still goes from 0..MAX,
but the AES-CTR now encrypts the counter plus some offset, keeping
both the regular behavior and the new one
2025-06-04 09:57:18 +02:00
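A schematic of the described counter scheme; `aes_encrypt_block` is a stand-in, not the tfhe-csprng API, and only the control flow is meaningful:

```rust
/// Placeholder mixing function, NOT AES: the real generator encrypts the
/// block with an AES key that is derived differently for Xof seeds.
fn aes_encrypt_block(key: u128, block: u128) -> u128 {
    block.wrapping_mul(0x9e37_79b9_7f4a_7c15_f39c_c060_5ced_c834 | 1) ^ key
}

/// i-th 16-byte output block. With ctr_offset == 0 this is the regular
/// AES-CTR behavior; an Xof-style seed supplies a non-zero starting offset
/// while the index itself still runs over 0..MAX.
fn output_block(key: u128, ctr_offset: u128, index: u128) -> [u8; 16] {
    aes_encrypt_block(key, index.wrapping_add(ctr_offset)).to_le_bytes()
}
```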
tmontaigu
c0e89a53ef fix(csprng): fix the endianness of the counter
This commit fixes the endianness (little) of the counter
representation used in the AES-CTR counter.

This is so that the random bytes generated are the same no matter
the endianness of the system.

A test case with known answers is added, as well as a make command
to run the test on an emulated big-endian arch using the `cross`
utility.

This also includes a small refactor where the block cipher
no longer encrypts `AesIndex`. This makes more sense
(AES encrypts bytes, not numbers) and allows moving and centralizing
the concept of endianness, as well as centralizing where batches are created.
2025-06-04 09:57:18 +02:00
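The endianness point boils down to serializing the counter to bytes explicitly before it reaches the cipher, e.g. (illustrative):

```rust
/// Fixing the counter-to-block conversion to little-endian makes the random
/// stream identical on little- and big-endian hosts: the in-memory layout of
/// u128 never reaches the block cipher, only these bytes do.
fn counter_to_block(counter: u128) -> [u8; 16] {
    counter.to_le_bytes()
}

#[test]
fn known_answer() {
    // A known-answer test like this passes unchanged when run with `cross`
    // on an emulated big-endian target.
    assert_eq!(counter_to_block(1)[0], 1);
    assert_eq!(counter_to_block(1)[15], 0);
}
```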
David Testé
312952007f chore(ci): lock zizmor version to avoid breaking ci pipelines
Newer versions of Zizmor can trigger errors due to new findings in workflows. To avoid breaking any ongoing pull-request due to this unhandled update, the zizmor version is locked.
2025-06-03 12:29:36 +02:00
Enzo Di Maria
ff51ed3f34 refactor(gpu): moving trim_radix_blocks_lsb_async to backend 2025-06-03 11:42:18 +02:00
Agnes Leroy
9737bdcb98 fix(gpu): fix degrees after bitxor 2025-06-03 08:47:12 +02:00
tmontaigu
87a43a4900 chore(integer): add determinism check for sum 2025-06-02 17:37:21 +02:00
Agnes Leroy
345bdbf17f feat(gpu): add memory tracking function for cmux 2025-06-02 17:29:17 +02:00
Agnes Leroy
cc54ba2236 chore(gpu): fix overflow in div in long run tests 2025-06-02 17:05:09 +02:00
David Testé
11df6c69ee chore(ci): fix workflow security warnings
Since Zizmor v1.9.0, new pedantic warnings are detected, especially
regarding template-injection patterns.
2025-06-02 14:46:14 +02:00
Guillermo Oyarzun
b76f4dbfe0 fix(gpu): fix hardcoded use of message modulus 2025-06-02 10:43:14 +02:00
Enzo Di Maria
be21c15c80 refactor(gpu): moving extend_radix_with_trivial_zero_blocks_msb to backend 2025-06-02 09:19:51 +02:00
tmontaigu
aa51b25313 chore(ci): fix test_user_docs run and add hpu
Due to the #[cfg] before the test_user_docs module, the module would
not actually be compiled (and thus run the user doc tests) unless all required
features were activated when running.

So we remove these cfgs, as each hardware doc supports its own set of
features and it's better to have a test fail because a feature is
missing rather than silently not run anything.

Also, add commands and CI plumbing to check the HPU docs
2025-05-30 16:36:56 +02:00
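The failure mode is easy to picture: a `#[cfg]`-gated module silently disappears when a feature is off, so its tests report nothing instead of failing. A minimal sketch (feature names illustrative):

```rust
// Before: the module (and every test in it) only exists when ALL listed
// features are on, so a partial feature set runs zero doc tests, silently.
#[cfg(all(feature = "gpu", feature = "hpu"))]
mod test_user_docs_gated {
    // ... doc-snippet tests ...
}

// After: the module always compiles; a snippet needing a missing feature now
// fails loudly at build/test time instead of being skipped.
mod test_user_docs {
    // ... one doc-snippet test per hardware backend ...
}
```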
tmontaigu
300c95fe3d fix(doc): finish HPU example fix 2025-05-30 16:36:56 +02:00
pgardratzama
524adda8f6 fix(doc): hpu example was not compiling 2025-05-30 16:36:56 +02:00
tmontaigu
dedcf205b4 feat(integer): improve default neg 2025-05-30 15:02:35 +02:00
tmontaigu
2c8d4c0fb0 feat(hlapi): add overflowing_neg 2025-05-30 15:02:35 +02:00
tmontaigu
3370fb5b7e feat(gpu): add overflowing_neg 2025-05-30 15:02:35 +02:00
tmontaigu
cd77eac42b feat(integer): add overflowing_neg 2025-05-30 15:02:35 +02:00
Baptiste Roux
40f20b4ecb fix(hpu): Rewrite hpu_bench iteration loop
The hpu_bench example was wrong for iter > 1 following clippy modifications.
NB: the vector is collected, but intermediate values are explicitly dropped to enable long-running stress tests.
2025-05-28 14:45:45 +02:00
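The shape of the fix, sketched on plain buffers rather than HPU ciphertexts: collect the values the iteration logic needs, but drop each large intermediate explicitly so memory stays flat across a long stress run.

```rust
fn run_iterations(iters: usize) -> Vec<u64> {
    (0..iters)
        .map(|i| {
            // Hypothetical expensive intermediate (stands in for a ciphertext).
            let intermediate = vec![0u8; 1 << 20];
            let result = intermediate.len() as u64 + i as u64;
            // Release the big buffer before any further per-iteration work,
            // so large `iters` values don't accumulate live allocations.
            drop(intermediate);
            result
        })
        .collect()
}
```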
Agnes Leroy
59a78c76a9 fix(gpu): fix build after shift/rotate mem tracking merge 2025-05-28 12:08:09 +02:00
Pedro Alves
1025246b17 fix(gpu): fix a linking problem on Hopper GPUs 2025-05-28 09:27:33 +02:00
Agnes Leroy
338e9eaeef feat(gpu): add memory tracking functions for shift/rotate 2025-05-28 09:26:27 +02:00
David Testé
0bec4d2ba1 chore(ci): pin rust-toolchain action to v1 2025-05-27 17:31:33 +02:00
David Testé
c5fab98900 chore(ci): add token to do online workflow security checks 2025-05-27 17:31:33 +02:00
Nicolas Sarlin
14e1ee5bd3 fix(gpu): build with hpu and zk features 2025-05-27 16:10:38 +02:00
Pedro Alves
52bc778629 feat(gpu): completely remove the internal CUDA_STREAMS in the HL API
- From now on, the streams stored in the available cuda server key are the ones to be used
2025-05-27 10:29:34 -03:00
Pedro Alves
10405c9836 feat(gpu): improve test_specific_gpu_selection() so it always tests all possible GPU configurations 2025-05-27 10:29:34 -03:00
Pedro Alves
5eaf6cec55 feat(gpu): reintroduce the feature that allows a user to perform computation on multi-gpu using a custom selection of GPUs
This reverts commit a7d8d2b1d4.
2025-05-27 10:29:34 -03:00
Agnes Leroy
3bfacc1e9d chore(bench): add swap throughput benchmark 2025-05-27 12:08:31 +02:00
Agnes Leroy
a47a418d41 chore(gpu): rework dex bench to prepare throughput benchmark 2025-05-27 12:08:31 +02:00
David Testé
75b3141e19 chore(ci): fix command parsing for gpu benchmark common workflow
Quote escaping was flawed and would generate an array containing a single string instead of several ones separated by commas.
2025-05-27 10:14:06 +02:00
Agnes Leroy
d01328e0fe fix(gpu): fix overflow error in clear inputs remainder in long run tests 2025-05-26 22:51:18 +02:00
Agnes Leroy
6e102b5fa1 chore(gpu): fix oom error in ci 2025-05-26 22:50:55 +02:00
Pedro Alves
8aa6fa514e fix(gpu): add missing error checks after some kernels 2025-05-26 16:29:23 -03:00
Nicolas Sarlin
21a19cd3c5 chore(shortint): modswitch noise reduction key upgrade without clone 2025-05-26 16:53:35 +02:00
Nicolas Sarlin
f51c70d536 feat(shortint): adds generic client key for atomic pattern support 2025-05-26 16:53:35 +02:00
Agnes Leroy
66e3c02838 feat(gpu): add memory tracking functions for comparisons 2025-05-23 14:37:39 +02:00
Pedro Alves
408e81c45a feat(gpu): add support for GPU-accelerated expand on the HL Api
- includes documentation about GPU-accelerated expand on the HL API
- reworks CudaKeySwitchingKey
- cloning the key is no longer necessary on the HL API
2025-05-23 11:54:29 +02:00
dependabot[bot]
4152906c5d chore(deps): bump actions/upload-artifact from 4.6.0 to 4.6.2
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.6.0 to 4.6.2.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/v4.6.0...ea165f8d65b6e75b540449e92b4886f43607fa02)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-version: 4.6.2
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-05-23 11:23:02 +02:00
dependabot[bot]
9fc8a0b5bc chore(deps): bump codecov/codecov-action from 5.4.2 to 5.4.3
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 5.4.2 to 5.4.3.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](ad3126e916...18283e04ce)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-version: 5.4.3
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-05-23 11:22:55 +02:00
dependabot[bot]
5dc3e59d13 chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions
Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 3.0.23 to 3.0.25.
- [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases)
- [Commits](4830be28ce...fc87bb5b5a)

---
updated-dependencies:
- dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions
  dependency-version: 3.0.25
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-05-23 11:22:48 +02:00
Nicolas Sarlin
b40996a7e5 chore(shortint): prepare the v1.3 params folder 2025-05-23 10:57:56 +02:00
Pedro Alves
b066ef19fa fix(gpu): fix the internal benchmark 2025-05-23 10:32:24 +02:00
Nicolas Sarlin
25d008bae8 fix(bench): add missing internal keycache feature 2025-05-22 16:14:30 +02:00
David Testé
2749c1088c chore(ci): handle multi directories for parameters records 2025-05-22 15:03:02 +02:00
Guillermo Oyarzun
c19cd9f021 fix(gpu): add indexes to modulus switch noise reduction 2025-05-22 10:50:51 +02:00
Nicolas Sarlin
45fdba04b1 fix(gpu): allow to build with hpu feature enabled 2025-05-22 10:21:35 +02:00
youben11
69d46810b8 feat(core): chunked seeded_lwe_ksk generation 2025-05-21 18:06:58 +01:00
youben11
a16eeb983f feat(core): chunked lwe_ksk generation 2025-05-21 18:06:58 +01:00
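Schematically, chunked generation trades one huge allocation for a stream of fixed-size pieces; this sketch is illustrative, not the core_crypto API:

```rust
/// Generate a long key chunk by chunk and hand each chunk to a consumer
/// (e.g. a serializer), so the peak allocation is one chunk, not the key.
fn generate_key_chunked(
    total: usize,
    chunk: usize,
    mut gen_element: impl FnMut(usize) -> u64,
    mut consume: impl FnMut(&[u64]),
) {
    let mut start = 0;
    while start < total {
        let end = (start + chunk).min(total);
        let buf: Vec<u64> = (start..end).map(&mut gen_element).collect();
        consume(&buf); // buf is dropped before the next chunk is generated
        start = end;
    }
}
```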
Agnes Leroy
8278a9373c fix(gpu): fix degrees after abs 2025-05-21 15:46:18 +02:00
Arthur Meyre
e2a2768484 chore: fix typos
Co-authored-by: crStiv <cryptostiv7@gmail.com>
2025-05-21 13:06:42 +02:00
Arthur Meyre
57cfc38b66 chore: some more CODEOWNERS 2025-05-21 11:30:35 +02:00
Pedro Alves
259d125434 fix(gpu): fix pbs and ks benchmarks 2025-05-20 17:37:48 +02:00
Arthur Meyre
2571196b41 chore: fix ambiguous decrypt 2025-05-20 17:32:05 +02:00
Arthur Meyre
9f3dc6167d chore: remove raw decomposition
- this was left in by mistake
2025-05-20 17:32:05 +02:00
Agnes Leroy
59c17692a3 feat(gpu): add memory tracking functions for bitops 2025-05-20 16:16:22 +02:00
David Testé
e29d615b9d chore(bench): add suitable heuristic for zk throughput
The heuristic based on PBS count was flawed, since a ZK verification operation will eat up to 32 threads on the machine. The previous heuristic could generate an input data vector way bigger than the total number of threads divided by 32. This in turn led to long execution times for the benchmark and generated bad results.
2025-05-20 15:02:59 +02:00
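The corrected heuristic amounts to capping the generated workload by threads-per-verification rather than by PBS count; a sketch under the 32-threads-per-verification assumption stated above:

```rust
/// Number of ZK verification inputs worth generating for a throughput run:
/// each verification may saturate up to 32 threads, so anything beyond
/// total_threads / 32 only inflates execution time without adding load.
fn zk_throughput_elements(total_threads: usize) -> usize {
    const THREADS_PER_VERIFICATION: usize = 32;
    (total_threads / THREADS_PER_VERIFICATION).max(1)
}
```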
tmontaigu
8caff604ed chore: use wrapping div in long_run 2025-05-20 14:36:22 +02:00
Agnes Leroy
16badf0c00 chore(gpu): add degree prints in long run tests in case of failure 2025-05-20 14:13:59 +02:00
Nicolas Sarlin
99a27c1cbe chore(hpu): fix Cargo.toml for release 2025-05-19 17:47:40 +02:00
Nicolas Sarlin
9131aaa383 fix(doc): uniformize readme file names 2025-05-19 15:22:34 +02:00
Nicolas Sarlin
a01949e630 fix(bench): compilation error without the internal-keycache feature 2025-05-19 09:50:29 +02:00
Arthur Meyre
30a58cdd1a chore: update version in docs to 1.2.0 2025-05-16 17:10:12 +02:00
Agnes Leroy
03325bf94e feat(gpu): add memory tracking functions for add/sub and scalar add/sub 2025-05-16 16:39:34 +02:00
Nicolas Sarlin
786fe66495 chore(zk): check that crs group element at index n is 0 2025-05-16 16:38:27 +02:00
Baptiste Roux
9ee8259002 feat(hpu): Add Hpu backend implementation
This backend abstracts communication with the Hpu Fpga hardware.
It defines its own entities to prevent circular dependencies with
tfhe-rs.
Object lifetime is handled through an Arc<Mutex<T>> wrapper, enforcing
that all objects currently alive in Hpu Hw are also kept valid on the
host side.

It contains the second version of the HPU instruction set (HIS_V2.0):
* DOp have the following properties:
  + Templates as first-class citizens
  + Support of Immediate templates
  + Direct parsing and conversion between Asm/Hex
  + Replace deku (and its associated endianness limitation) with
    bitfield_struct and manual parsing

* IOp have the following properties:
  + Support a variable number of Destinations
  + Support a variable number of Sources
  + Support a variable number of Immediate values
  + Support of multiple bitwidths (not implemented yet in the Fpga
    firmware)

Details can be viewed in `backends/tfhe-hpu-backend/Readme.md`
2025-05-16 16:30:23 +02:00
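A minimal sketch of the stated lifetime rule, with illustrative types rather than the backend's own: every device-resident object sits behind Arc<Mutex<T>>, and the host keeps a clone of the Arc for as long as the HW may touch the object, so nothing alive on the Fpga can be freed host-side.

```rust
use std::sync::{Arc, Mutex};

/// Stand-in for a buffer mirrored on the Hpu Fpga.
struct HpuBuffer {
    bytes: Vec<u8>,
}

/// Host-side handle: each clone keeps the underlying object alive, which is
/// how "alive in Hpu Hw implies valid on the host" gets enforced.
#[derive(Clone)]
struct HpuHandle(Arc<Mutex<HpuBuffer>>);

impl HpuHandle {
    fn new(bytes: Vec<u8>) -> Self {
        HpuHandle(Arc::new(Mutex::new(HpuBuffer { bytes })))
    }

    /// All access goes through the lock, serializing host and HW views.
    fn with<R>(&self, f: impl FnOnce(&mut HpuBuffer) -> R) -> R {
        f(&mut self.0.lock().unwrap())
    }
}
```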
Agnes Leroy
a7d8d2b1d4 feat(gpu): revert "enable the user to perform computation on multi-gpu using a custom selection of GPUs"
This reverts commit 0280dbeb41.
2025-05-15 18:01:17 +02:00
734 changed files with 81680 additions and 7112 deletions

1
.gitattributes vendored Normal file
View File

@@ -0,0 +1 @@
*.hpu filter=lfs diff=lfs merge=lfs -text

View File

@@ -6,6 +6,7 @@ self-hosted-runner:
- large_windows_16_latest
- large_ubuntu_16
- large_ubuntu_16-22.04
- v80-desktop
# Configuration variables in array of strings defined in your repository or
# organization. `null` means disabling configuration variables check.
# Empty array means no configuration variable is allowed.

View File

@@ -33,7 +33,9 @@ runs:
if: inputs.github-instance == 'true'
shell: bash
run: |
TOOLKIT_VERSION="$(echo ${CUDA_VERSION} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${env.CUDA_KEYRING_PACKAGE}
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
sha256sum -c checksum

View File

@@ -67,7 +67,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -126,9 +126,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -174,7 +174,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -182,9 +182,11 @@ jobs:
if: needs.should-run.outputs.csprng_test == 'true'
run: |
make test_tfhe_csprng
make test_tfhe_csprng_big_endian
- name: Run tfhe-zk-pok tests
if: needs.should-run.outputs.zk_pok_test == 'true'
# Always run it to catch non deterministic bugs earlier
# if: needs.should-run.outputs.zk_pok_test == 'true'
run: |
make test_zk_pok
@@ -272,9 +274,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}

View File

@@ -114,7 +114,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -142,9 +142,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -115,7 +115,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -147,9 +147,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -185,7 +185,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -254,9 +254,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -68,7 +68,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -123,9 +123,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -58,14 +58,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -114,8 +117,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -58,14 +58,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -107,8 +110,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -58,14 +58,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -95,15 +98,27 @@ jobs:
env:
REF_NAME: ${{ github.ref_name }}
- name: Parse swap request PBS counts
- name: Parse swap request update PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_pbs_count.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_update_dex_balance_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
- name: Parse swap claim PBS counts
- name: Parse swap request finalize PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_pbs_count.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_finalize_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
- name: Parse swap claim prepare PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_prepare_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
- name: Parse swap claim update PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_update_dex_balance_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
@@ -116,8 +131,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -59,14 +59,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -111,8 +114,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -46,15 +46,18 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
echo "FAST_BENCH=TRUE";
} >> "${GITHUB_ENV}"
echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -93,8 +96,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -124,14 +130,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -159,7 +168,8 @@ jobs:
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs \
env:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -170,8 +180,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -84,7 +84,7 @@ jobs:
run: |
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
PARSED_COMMAND=$(echo "${INPUTS_COMMAND}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
PARSED_COMMAND=$(echo "${INPUTS_COMMAND}" | sed 's/[[:space:]]*,[[:space:]]*/\", \"/g')
echo "COMMAND=[\"${PARSED_COMMAND}\"]" >> "${GITHUB_ENV}"
- name: Set single operations flavor
@@ -120,25 +120,24 @@ jobs:
env:
INPUTS_PARAMS_TYPE: ${{ inputs.params_type }}
- name: Set command output
id: set_command
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "command=${{ toJSON(env.COMMAND) }}" >> "${GITHUB_OUTPUT}"
- name: Set operation flavor output
id: set_op_flavor
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
- name: Set benchmark types output
id: set_bench_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
- name: Set parameters types output
id: set_params_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "params_type=${{ toJSON(env.PARAMS_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -227,6 +226,8 @@ jobs:
include:
- cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -237,18 +238,20 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
# Re-export environment variables as dependencies setup perform this task in the previous job.
# Local env variables are cleaned at the end of each job.
- name: Export CUDA variables
shell: bash
run: |
CUDA_PATH=/usr/local/cuda-${{ matrix.cuda }}
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "PATH=$PATH:$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
@@ -258,13 +261,15 @@ jobs:
shell: bash
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -317,8 +322,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification

View File

@@ -119,14 +119,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -167,8 +170,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification

View File

@@ -120,14 +120,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -168,8 +171,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification

View File

@@ -0,0 +1,94 @@
# Run all integer benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
name: Hpu Integer Benchmarks
on:
workflow_dispatch:
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
permissions: {}
jobs:
integer-benchmarks-hpu:
name: Execute integer & erc20 benchmarks for HPU backend
runs-on: v80-desktop
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
steps:
# Needed as long as hw_regmap repository is private
- name: Configure SSH
uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Run benchmarks
run: |
make bench_integer_hpu
make bench_hlapi_erc20_hpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "hpu_x1" \
--backend hpu \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs
env:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_integer_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}

View File

@@ -78,12 +78,12 @@ jobs:
- name: Set operation flavor output
id: set_op_flavor
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
- name: Set benchmark types output
id: set_bench_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -128,14 +128,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -193,8 +196,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -47,7 +47,7 @@ jobs:
- name: Set operation flavor output
id: set_op_flavor
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -89,14 +89,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -150,8 +153,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -78,12 +78,12 @@ jobs:
- name: Set operation flavor output
id: set_op_flavor
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
- name: Set benchmark types output
id: set_bench_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -128,14 +128,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -185,8 +188,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -61,11 +61,14 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -107,8 +110,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -61,11 +61,14 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -107,8 +110,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -98,14 +98,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -141,7 +144,7 @@ jobs:
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_tfhe_zk_pok
name: ${{ github.sha }}_tfhe_zk_pok_${{ env.BENCH_TYPE }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
@@ -155,8 +158,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -96,14 +96,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -136,12 +139,16 @@ jobs:
- name: Install web resources
run: |
make install_${{ matrix.browser }}_browser
make install_${{ matrix.browser }}_web_driver
make install_"${BROWSER}"_browser
make install_"${BROWSER}"_web_driver
env:
BROWSER: ${{ matrix.browser }}
- name: Run benchmarks
run: |
make bench_web_js_api_parallel_${{ matrix.browser }}_ci
make bench_web_js_api_parallel_"${BROWSER}"_ci
env:
BROWSER: ${{ matrix.browser }}
- name: Parse results
run: |
@@ -188,8 +195,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -92,7 +92,7 @@ jobs:
- name: Set benchmark types output
id: set_bench_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -140,14 +140,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -191,7 +194,7 @@ jobs:
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_integer_zk
name: ${{ github.sha }}_integer_zk_${{ matrix.bench_type }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
@@ -205,8 +208,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -35,7 +35,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -94,5 +94,10 @@ jobs:
run: |
make build_tfhe_coverage
- name: Run Hpu pcc checks
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make pcc_hpu
# The wasm build check is a bit annoying to set-up here and is done during the tests in
# aws_tfhe_tests.yml

View File

@@ -51,7 +51,7 @@ jobs:
runs-on: ${{ matrix.runner_type }}
strategy:
matrix:
runner_type: [ubuntu-latest, macos-latest, windows-latest]
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -82,7 +82,7 @@ jobs:
runs-on: ${{ matrix.runner_type }}
strategy:
matrix:
runner_type: [ubuntu-latest, macos-latest, windows-latest]
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:

View File

@@ -51,7 +51,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -77,7 +77,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ ubuntu-latest, macos-latest, windows-latest ]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:

View File

@@ -3,14 +3,15 @@ name: Check commit and PR compliance
on:
pull_request:
permissions:
contents: read
pull-requests: read # Permission needed to scan commits in a pull-request
permissions: {}
jobs:
check-commit-pr:
name: Check commit and PR
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write # Permission needed to scan commits in a pull-request and write issue comment
steps:
- name: Check first line
uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee

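This permissions change follows the least-privilege layout: the workflow-level block is emptied so the default token carries no scopes at all, and each job then requests only what it needs. A sketch of the resulting shape (the job name comes from the hunk, the step is illustrative):

```yaml
permissions: {}            # workflow level: drop all default token scopes
jobs:
  check-commit-pr:
    runs-on: ubuntu-latest
    permissions:
      contents: read       # needed to check out the repository
      pull-requests: write # scan commits and post an issue comment
    steps:
      - run: echo "the job token now carries only the scopes listed above"
```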
View File

@@ -25,10 +25,10 @@ jobs:
- name: Get actionlint
run: |
wget "https://github.com/rhysd/actionlint/releases/download/v${{ env.ACTIONLINT_VERSION }}/actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz"
echo "${{ env.ACTIONLINT_CHECKSUM }} actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz" > checksum
wget "https://github.com/rhysd/actionlint/releases/download/v${ACTIONLINT_VERSION}/actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz"
echo "${ACTIONLINT_CHECKSUM} actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz" > checksum
sha256sum -c checksum
tar -xf actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz actionlint
tar -xf actionlint_"${ACTIONLINT_VERSION}"_linux_amd64.tar.gz actionlint
ln -s "$(pwd)/actionlint" /usr/local/bin/
- name: Lint workflows
@@ -38,9 +38,11 @@ jobs:
- name: Check workflows security
run: |
make check_workflow_security
env:
GH_TOKEN: ${{ env.CHECKOUT_TOKEN }}
- name: Ensure SHA pinned actions
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@4830be28ce81da52ec70d65c552a7403821d98d4 # v3.0.23
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@fc87bb5b5a97953d987372e74478de634726b3e5 # v3.0.25
with:
allowlist: |
slsa-framework/slsa-github-generator

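The actionlint hunk combines two patterns from this changeset: versions are routed through plain environment variables instead of `${{ env.… }}` interpolation, and the downloaded tarball must match a pinned sha256 digest before it is unpacked. A sketch with placeholder version and checksum values:

```yaml
- name: Get actionlint
  run: |
    wget "https://github.com/rhysd/actionlint/releases/download/v${ACTIONLINT_VERSION}/actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz"
    echo "${ACTIONLINT_CHECKSUM} actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz" > checksum
    sha256sum -c checksum   # fails the job if the digest does not match
    tar -xf actionlint_"${ACTIONLINT_VERSION}"_linux_amd64.tar.gz actionlint
  env:
    ACTIONLINT_VERSION: "1.0.0"                             # placeholder
    ACTIONLINT_CHECKSUM: "<sha256 of the release tarball>"  # placeholder
```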
View File

@@ -54,7 +54,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -90,7 +90,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -104,7 +104,7 @@ jobs:
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -66,7 +66,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -3,7 +3,7 @@ name: Close or Merge corresponding PR on the data repo
# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
env:
TARGET_REPO_API_URL: ${{ github.api_url }}/repos/zama-ai/tfhe-backward-compat-data
DATA_REPO: zama-ai/tfhe-backward-compat-data
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -16,117 +16,43 @@ on:
pull_request:
types: [ closed ]
# The same pattern is used for jobs that use the github api:
# - save the result of the API call in the env var "GH_API_RES". Since the var is multiline
# we use this trick: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
# - "set +e" will make sure we reach the last "echo EOF" even in case of error
# - "set -o" pipefail makes one line piped command return the error of the first failure
# - 'RES="$?"' and 'exit $RES' are used to return the error code if a command failed. Without it, with "set +e"
# the script will always return 0 because of the "echo EOF".
permissions: {}
jobs:
auto_close_job:
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') && github.repository == 'zama-ai/tfhe-rs' }}
runs-on: ubuntu-latest
env:
GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }} # Needed for gh CLI commands
steps:
- name: Find corresponding Pull Request in the data repo
- name: Fetch PR number
run: |
{
set +e
set -o pipefail
echo 'TARGET_REPO_PR<<EOF'
curl --fail-with-body --no-progress-meter -L -X GET \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_API_URL}"/pulls\?head="${REPO_OWNER}":"${PR_BRANCH}" | jq -e '.[0]' | sed 's/null/{ "message": "corresponding PR not found" }/'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
env:
REPO_OWNER: ${{ github.repository_owner }}
PR_NUMBER=$(gh pr view "${PR_BRANCH}" --repo "${DATA_REPO}" --json number | jq '.number')
echo "DATA_REPO_PR_NUMBER=${PR_NUMBER}" >> "${GITHUB_ENV}"
- name: Comment on the PR to indicate the reason for the close
run: |
BODY="'{ \"body\": \"PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}\" }'"
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${COMMENTS_URL}" \
-d "${BODY}"
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
gh pr comment "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--body "PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}"
env:
REPO: ${{ github.repository }}
EVENT_NUMBER: ${{ github.event.number }}
COMMENTS_URL: ${{ fromJson(env.TARGET_REPO_PR).comments_url }}
- name: Merge the Pull Request in the data repo
if: ${{ github.event.pull_request.merged }}
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X PUT \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_PR_URL}"/merge \
-d '{ "merge_method": "rebase" }'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
env:
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
gh pr merge "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--rebase \
--delete-branch
- name: Close the Pull Request in the data repo
if: ${{ !github.event.pull_request.merged }}
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X PATCH \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_PR_URL}" \
-d '{ "state": "closed" }'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
env:
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
- name: Delete the associated branch in the data repo
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X DELETE \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_API_URL}"/git/refs/heads/"${PR_BRANCH}"
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
gh pr close "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--delete-branch
- name: Slack Notification
if: ${{ always() && job.status == 'failure' }}
@@ -134,4 +60,4 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: https://github.com/${{ env.DATA_REPO }}/pull/${{ env.DATA_REPO_PR_NUMBER }}"

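The net effect of this rewrite: four hand-rolled curl calls against the REST API, each wrapped in the fragile multiline GITHUB_ENV capture that the deleted comment documents, collapse into gh CLI one-liners that handle authentication (via GH_TOKEN), error reporting, and JSON output natively. A sketch of the new shape; where PR_BRANCH comes from is an assumption here:

```yaml
- name: Close the corresponding data-repo PR
  env:
    GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
    DATA_REPO: zama-ai/tfhe-backward-compat-data
    PR_BRANCH: ${{ github.event.pull_request.head.ref }}  # assumed source of the branch name
  run: |
    gh pr comment "${PR_BRANCH}" --repo "${DATA_REPO}" \
      --body "PR closed because the corresponding PR in the main repo was closed"
    gh pr close "${PR_BRANCH}" --repo "${DATA_REPO}" --delete-branch
```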
View File

@@ -45,7 +45,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -140,7 +140,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -172,9 +172,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -124,7 +124,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -156,9 +156,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -79,7 +79,7 @@ jobs:
gcc-version: ${{ matrix.gcc }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -126,7 +126,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -149,7 +149,7 @@ jobs:
- name: Run High Level API Tests
run: |
BIG_TESTS_INSTANCE=FALSE make test_high_level_api_gpu
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
@@ -161,9 +161,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -72,7 +72,7 @@ jobs:
gcc-version: ${{ matrix.gcc }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -1,4 +1,4 @@
# Perfom tfhe-cuda-backend post-commit checks on an AWS instance
# Perform tfhe-cuda-backend post-commit checks on an AWS instance
name: Cuda - Post-commit Checks
env:
@@ -81,16 +81,20 @@ jobs:
if: env.SECRETS_AVAILABLE == 'false'
shell: bash
run: |
TOOLKIT_VERSION="$(echo ${{ matrix.cuda }} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
# Use sed to extract a value from a string; this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/"${CUDA_KEYRING_PACKAGE}"
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
sha256sum -c checksum
sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
sudo apt update
sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
env:
CUDA_VERSION: ${{ matrix.cuda }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -100,17 +104,21 @@ jobs:
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc" >> "${GITHUB_ENV}"
env:
CUDA_VERSION: ${{ matrix.cuda }}
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Run fmt checks
run: |
@@ -120,12 +128,17 @@ jobs:
run: |
make pcc_gpu
- name: Check build with hpu enabled
run: |
make clippy_gpu_hpu
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}

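The same env indirection is applied to matrix values here, with one extra wrinkle: apt names CUDA packages with a dash (e.g. `cuda-toolkit-12-4`), so the dotted matrix version is rewritten with sed. The capture-group form replaces only the final dot, which bash's `${CUDA_VERSION//./-}` cannot express (it replaces every dot), which is likely why the comment rules it out. A sketch with the version as a placeholder:

```yaml
- name: Derive the apt package name
  run: |
    # "12.4" -> "12-4"; only the final '.' becomes '-'
    # shellcheck disable=SC2001
    TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
    sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}"
  env:
    CUDA_VERSION: ${{ matrix.cuda }}
```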
View File

@@ -126,7 +126,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -144,9 +144,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -140,7 +140,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -158,9 +158,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -130,7 +130,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -156,9 +156,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -126,7 +126,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -144,9 +144,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -140,7 +140,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -158,9 +158,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -130,7 +130,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -156,9 +156,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

.github/workflows/hpu_hlapi_tests.yml vendored Normal file
View File

@@ -0,0 +1,73 @@
# Test the high-level API on HPU
name: Cargo Test HLAPI HPU
on:
pull_request:
push:
branches:
- main
env:
CARGO_TERM_COLOR: always
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions: { }
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: read
outputs:
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
with:
files_yaml: |
hpu:
- tfhe/Cargo.toml
- Makefile
- backends/tfhe-hpu-backend/**
- mockups/tfhe-hpu-mockup/**
cargo-tests-hpu:
needs: should-run
if: needs.should-run.outputs.hpu_test == 'true'
runs-on: large_ubuntu_16
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
with:
toolchain: stable
override: true
- name: Install Just
run: |
cargo install just
- name: Test HLAPI HPU
run: |
source setup_hpu.sh
just -f mockups/tfhe-hpu-mockup/Justfile BUILD_PROFILE=release mockup &
make HPU_CONFIG=sim test_high_level_api_hpu
make HPU_CONFIG=sim test_user_doc_hpu

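The final step of this new workflow deserves a gloss: the HPU mockup is a software simulator standing in for the FPGA. It is launched in the background, and `HPU_CONFIG=sim` then points the high-level API and documentation tests at it. The step, annotated:

```yaml
- name: Test HLAPI HPU
  run: |
    source setup_hpu.sh        # export the HPU environment variables
    # launch the software mockup in the background; it plays the role of the FPGA
    just -f mockups/tfhe-hpu-mockup/Justfile BUILD_PROFILE=release mockup &
    # HPU_CONFIG=sim routes the tests to the simulator instead of real hardware
    make HPU_CONFIG=sim test_high_level_api_hpu
    make HPU_CONFIG=sim test_user_doc_hpu
```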
View File

@@ -57,7 +57,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -46,7 +46,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -67,7 +67,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -78,19 +78,24 @@ jobs:
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
} >> "${GITHUB_ENV}"
env:
CUDA_VERSION: ${{ matrix.cuda }}
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Prepare package
run: |
cargo package -p tfhe-cuda-backend
@@ -129,7 +134,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -140,19 +145,23 @@ jobs:
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
} >> "${GITHUB_ENV}"
env:
CUDA_VERSION: ${{ matrix.cuda }}
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Publish crate.io package
env:

.github/workflows/make_release_hpu.yml vendored Normal file
View File

@@ -0,0 +1,105 @@
name: Publish HPU release
on:
workflow_dispatch:
inputs:
dry_run:
description: "Dry-run"
type: boolean
default: true
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package:
runs-on: ubuntu-latest
needs: verify_tag
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Prepare package
run: |
cargo package -p tfhe-hpu-backend
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: crate
path: target/package/*.crate
- name: generate hash
id: hash
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
# Needed to detect the GitHub Actions environment
actions: read
# Needed to create the provenance via GitHub OIDC
id-token: write
# Needed to upload assets/artifacts
contents: write
with:
# SHA-256 hashes of the Crate package.
base64-subjects: ${{ needs.package.outputs.hash }}
publish_release:
name: Publish tfhe-hpu-backend Release
runs-on: ubuntu-latest
needs: [verify_tag, package] # for comparing hashes
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Publish crate.io package
env:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# The DRY_RUN expansion must not be double-quoted: when the variable holds an empty string, a quoted
# expansion would pass an empty argument and cargo publish would fail. This is safe since DRY_RUN is set in the env section above.
# shellcheck disable=SC2086
cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
- name: Generate hash
id: published_hash
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
- name: Slack notification (hashes comparison)
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
env:
SLACK_COLOR: failure
SLACK_MESSAGE: "SLSA tfhe-hpu-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-hpu-backend release failed: (${{ env.ACTION_RUN_URL }})"

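Two details of this new release workflow are worth spelling out. The SLSA flow hashes the crate twice, once at package time and once after publishing, and any mismatch trips the Slack alert, so a tampered upload cannot pass silently. And the dry-run toggle uses the `&&`/`||` ternary idiom with a deliberately unquoted expansion, as the in-file comment explains:

```yaml
- name: Publish crate.io package
  env:
    CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
    # '&&'/'||' acts as a ternary: "--dry-run" when dry_run is true, "" otherwise
    DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
  run: |
    # ${DRY_RUN} stays unquoted on purpose: a quoted empty value would be
    # passed to cargo as an empty argument and break the publish
    # shellcheck disable=SC2086
    cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
```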
.gitignore vendored
View File

@@ -40,3 +40,6 @@ __pycache__
# First directive is to ignore symlinks
tests/tfhe-backward-compat-data
ci/
# In case someone clones the lattice-estimator locally to verify security
/lattice-estimator

.lfsconfig Normal file
View File

@@ -0,0 +1,2 @@
[lfs]
fetchexclude = *

View File

@@ -1,12 +1,28 @@
# Specifying a path without code owners means that path won't have owners and is akin to a negation
# i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
# See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file
/backends/tfhe-cuda-backend/ @agnesLeroy
/backends/tfhe-hpu-backend/ @zama-ai/hardware
/tfhe/examples/hpu @zama-ai/hardware
/tfhe/src/core_crypto/ @IceTDrinker
/tfhe/src/core_crypto/gpu
/tfhe/src/core_crypto/gpu @agnesLeroy
/tfhe/src/core_crypto/hpu @zama-ai/hardware
/tfhe/src/shortint/ @mayeul-zama
/tfhe/src/integer/ @tmontaigu
/tfhe/src/integer/gpu
/tfhe/src/integer/gpu @agnesLeroy
/tfhe/src/integer/hpu @zama-ai/hardware
/tfhe/src/high_level_api/ @tmontaigu
/Makefile @IceTDrinker @soonum
/mockups/tfhe-hpu-mockup @zama-ai/hardware
/.github/ @soonum
/CODEOWNERS @IceTDrinker

View File

@@ -9,10 +9,12 @@ members = [
"tasks",
"tfhe-csprng",
"backends/tfhe-cuda-backend",
"backends/tfhe-hpu-backend",
"utils/tfhe-versionable",
"utils/tfhe-versionable-derive",
"utils/param_dedup",
"tests",
"mockups/tfhe-hpu-mockup",
]
exclude = [

Makefile
View File

@@ -2,6 +2,7 @@ SHELL:=$(shell /usr/bin/env which bash)
OS:=$(shell uname)
RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
CARGO_BUILD_JOBS=default
CPU_COUNT=$(shell ./scripts/cpu_count.sh)
RS_BUILD_TOOLCHAIN:=stable
CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
@@ -55,6 +56,9 @@ REGEX_PATTERN?=''
TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
# tfhe-hpu-backend
HPU_CONFIG=v80
# Exclude these files from coverage reports
define COVERAGE_EXCLUDED_FILES
--exclude-files apps/trivium/src/trivium/* \
@@ -166,9 +170,13 @@ install_typos_checker: install_rs_build_toolchain
.PHONY: install_zizmor # Install zizmor workflow security checker
install_zizmor: install_rs_build_toolchain
@zizmor --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
( echo "Unable to install zizmor, unknown error." && exit 1 )
.PHONY: install_cargo_cross # Install cross for cross-compilation tests
install_cargo_cross: install_rs_build_toolchain
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross
.PHONY: setup_venv # Setup Python virtualenv for wasm tests
setup_venv:
python3 -m venv venv
@@ -290,7 +298,7 @@ check_typos: install_typos_checker
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
@@ -301,6 +309,20 @@ check_gpu: install_rs_check_toolchain
--all-targets \
-p $(TFHE_SPEC)
.PHONY: clippy_hpu # Run clippy lints on tfhe with "hpu" enabled
clippy_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,hpu,pbs-stats,extended-types \
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
clippy_gpu_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
fix_newline: check_linelint_installed
linelint -a .
@@ -440,6 +462,8 @@ clippy_tfhe_csprng: install_rs_check_toolchain
clippy_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings
.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
clippy_versionable: install_rs_check_toolchain
@@ -473,6 +497,11 @@ clippy_cuda_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-cuda-backend -- --no-deps -D warnings
.PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend
clippy_hpu_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-hpu-backend -- --no-deps -D warnings
.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
check_rust_bindings_did_not_change:
cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
@@ -702,6 +731,28 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_integer_hpu_ci # Run the tests for integer ci on hpu backend
test_integer_hpu_ci: install_rs_check_toolchain install_cargo_nextest
cargo test --release -p $(TFHE_SPEC) --features hpu-v80 --test hpu
.PHONY: test_integer_hpu_mockup_ci # Run the tests for integer ci on hpu backend and mockup
test_integer_hpu_mockup_ci: install_rs_check_toolchain install_cargo_nextest
source ./setup_hpu.sh --config sim ; \
cargo build --release --bin hpu_mockup; \
coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \
HPU_TEST_ITER=1 \
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
kill %1
.PHONY: test_integer_hpu_mockup_ci_fast # Run the quick tests for integer ci on hpu backend and mockup.
test_integer_hpu_mockup_ci_fast: install_rs_check_toolchain install_cargo_nextest
source ./setup_hpu.sh --config sim ; \
cargo build --profile devo --bin hpu_mockup; \
coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \
HPU_TEST_ITER=1 \
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
kill %1
.PHONY: test_boolean # Run the tests of the boolean module
test_boolean: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -854,9 +905,25 @@ test_high_level_api: install_rs_build_toolchain
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*gpu.*/)"
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--build-jobs=$(CARGO_BUILD_JOBS) \
--test-threads=1 \
--features=integer,internal-keycache,hpu,hpu-v80 -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*hpu.*/)"
else
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--build-jobs=$(CARGO_BUILD_JOBS) \
--test-threads=1 \
--features=integer,internal-keycache,hpu -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*hpu.*/)"
endif
.PHONY: test_strings # Run the tests for strings ci
test_strings: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -874,9 +941,21 @@ test_user_doc: install_rs_build_toolchain
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
test_user_doc_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
--features=internal-keycache,integer,zk-pok,gpu -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_user_doc_hpu # Run tests for HPU from the .md documentation
test_user_doc_hpu: install_rs_build_toolchain
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=internal-keycache,integer,hpu,hpu-v80 -p $(TFHE_SPEC) \
-- test_user_docs::
else
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=internal-keycache,integer,hpu -p $(TFHE_SPEC) \
-- test_user_docs::
endif
.PHONY: test_regex_engine # Run tests for regex_engine example
@@ -907,10 +986,16 @@ test_tfhe_csprng: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-csprng
.PHONY: test_tfhe_csprng_big_endian # Run tfhe-csprng tests on an emulated big endian system
test_tfhe_csprng_big_endian: install_rs_build_toolchain install_cargo_cross
RUSTFLAGS="" cross $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
test_zk_pok: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok
-p tfhe-zk-pok --features experimental
.PHONY: test_zk_wasm_x86_compat_ci
test_zk_wasm_x86_compat_ci: check_nvm_installed
@@ -1012,7 +1097,7 @@ check_compile_tests: install_rs_build_toolchain
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
check_compile_tests_benches_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=experimental,boolean,shortint,integer,internal-keycache,gpu \
--features=experimental,boolean,shortint,integer,internal-keycache,gpu,zk-pok \
-p $(TFHE_SPEC)
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
@@ -1100,6 +1185,12 @@ clippy_bench_gpu: install_rs_check_toolchain
--features=gpu,shortint,integer,internal-keycache,nightly-avx512,pbs-stats,zk-pok \
-p tfhe-benchmark -- --no-deps -D warnings
.PHONY: clippy_bench_hpu # Run clippy lints on tfhe-benchmark
clippy_bench_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=hpu,shortint,integer,internal-keycache,pbs-stats \
-p tfhe-benchmark -- --no-deps -D warnings
.PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
print_doc_bench_parameters:
RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
@@ -1133,6 +1224,14 @@ bench_signed_integer_gpu: install_rs_check_toolchain
--bench integer-signed-bench \
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --
.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
bench_integer_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1146,7 +1245,7 @@ bench_integer_compression_gpu: install_rs_check_toolchain
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
.PHONY: bench_integer_zk_gpu
bench_integer_zk_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1324,6 +1423,14 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
--bench hlapi-dex \
--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
bench_hlapi_erc20_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
bench_tfhe_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
@@ -1384,7 +1491,7 @@ parse_wasm_benchmarks: install_rs_check_toolchain
.PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
write_params_to_file: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
--example write_params_to_file --features=boolean,shortint,internal-keycache
--example write_params_to_file --features=boolean,shortint,hpu,internal-keycache
.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
clone_backward_compat_data:
@@ -1423,6 +1530,9 @@ tfhe_lints
pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
check_md_docs_are_tested clippy_fast check_compile_tests

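The new mockup CI targets in this Makefile all share one recipe shape: source the simulator environment, build the mockup, run it as a bash coprocess, run the tests against it, then kill the coprocess. A condensed sketch of what one recipe executes, assuming the default `tfhe` crate for `$(TFHE_SPEC)`:

```sh
source ./setup_hpu.sh --config sim
cargo build --profile devo --bin hpu_mockup
# coproc runs the simulator as a background coprocess; 'kill %1' stops it once the tests pass
coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log
HPU_TEST_ITER=1 cargo test --profile devo -p tfhe --features hpu --test hpu -- u32 && kill %1
```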
View File

@@ -11,11 +11,13 @@ extend-ignore-identifiers-re = [
# Example with string replacing "hello" with "herlo"
"herlo",
# Example in trivium
"C9217BA0D762ACA1"
"C9217BA0D762ACA1",
"0x[0-9a-fA-F]+"
]
[files]
extend-exclude = [
"backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
"backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
"backends/tfhe-hpu-backend/config_store/**/*.link_summary",
]

View File

@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.
# FHE shortint Trivium implementation
The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
It uses a lower-level API of tfhe-rs, so the syntax is a little different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
on the same cryptographic parameters as the high-level API of tfhe-rs. As such, it requires a casting key to switch from one parameter space to another, which makes
its setup a little more intricate.
@@ -137,10 +137,10 @@ its setup a little more intricate.
Example code:
```rust
use tfhe::shortint::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{ConfigBuilder, generate_keys, FheUint64};
use tfhe::prelude::*;
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;
fn test_shortint() {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();

View File

@@ -1,9 +1,9 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::shortint::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};
pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
pub fn kreyvium_shortint_gen(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
pub fn kreyvium_shortint_trans(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

View File

@@ -1,9 +1,9 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::shortint::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};
pub fn trivium_shortint_warmup(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
pub fn trivium_shortint_gen(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
pub fn trivium_shortint_trans(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();

View File

@@ -1,9 +1,9 @@
use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
use tfhe::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
// Values for these tests come from the github repo renaud1239/Kreyvium,
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
#[test]
fn kreyvium_test_shortint_long() {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

View File

@@ -1,9 +1,9 @@
use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
use tfhe::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
// Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
#[test]
fn trivium_test_shortint_long() {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();

View File

@@ -28,9 +28,10 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
void cuda_improve_noise_modulus_switch_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
double r_sigma, double bound, uint32_t log_modulus);
void const *lwe_array_in, void const *lwe_array_indexes,
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
uint32_t log_modulus);
void cuda_glwe_sample_extract_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,

View File

@@ -24,7 +24,15 @@ using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
return std::get<Torus *>(variant); \
} \
}()
// Macro to define the visitor logic using std::holds_alternative for vectors
#define GET_VARIANT_ELEMENT_64BIT(variant, index) \
[&] { \
if (std::holds_alternative<std::vector<uint64_t *>>(variant)) { \
return std::get<std::vector<uint64_t *>>(variant)[index]; \
} else { \
return std::get<uint64_t *>(variant); \
} \
}()
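
A minimal C++ sketch of the visitor logic behind GET_VARIANT_ELEMENT_64BIT, assuming only what the macro shows: the variant carries either a single pointer shared by every block or a per-block vector of pointers, and the index is only meaningful in the vector case.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <variant>
#include <vector>

using LweArrayVariant = std::variant<std::vector<uint64_t *>, uint64_t *>;

// Same dispatch as the macro, written as a plain function.
uint64_t *get_variant_element(const LweArrayVariant &variant, size_t index) {
  if (std::holds_alternative<std::vector<uint64_t *>>(variant))
    return std::get<std::vector<uint64_t *>>(variant)[index];
  return std::get<uint64_t *>(variant);
}

int main() {
  uint64_t a = 1, b = 2;
  LweArrayVariant single = &a; // one pointer used for every block
  LweArrayVariant per_block = std::vector<uint64_t *>{&a, &b};
  assert(get_variant_element(single, 5) == &a);    // index ignored
  assert(get_variant_element(per_block, 1) == &b); // index applied
  return 0;
}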
int get_active_gpu_count(int num_inputs, int gpu_count);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

View File

@@ -400,8 +400,9 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaRadixCiphertextFFI *radix_lwe_vec,
bool reduce_degrees_for_single_carry_propagation, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
@@ -414,7 +415,8 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool allocate_ms_array);
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -538,5 +540,100 @@ void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void extend_radix_with_trivial_zero_blocks_msb_64(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
void *const *streams, uint32_t const *gpu_indexes);
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
void *const *streams,
uint32_t const *gpu_indexes);
uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
uint64_t rhs, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars);
void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_glwe_dimension,
uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t num_original_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks);
void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry);
void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_additional_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H

View File

@@ -6,6 +6,8 @@
#include "integer/radix_ciphertext.h"
#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "pbs/programmable_bootstrap_128.cuh"
#include "utils/helper_multi_gpu.cuh"
#include <cmath>
#include <functional>
@@ -249,7 +251,6 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes_in[i] = i;
@@ -528,10 +529,10 @@ template <typename Torus> struct int_radix_lut {
}
// Return a pointer to idx-ith degree
Torus *get_degree(size_t idx) { return &degrees[num_many_lut * idx]; }
uint64_t *get_degree(size_t idx) { return &degrees[num_many_lut * idx]; }
// Return a pointer to idx-ith max degree
Torus *get_max_degree(size_t idx) { return &max_degrees[idx]; }
uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; }
// Return a pointer to idx-ith lut indexes at gpu_index's global memory
Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
@@ -580,6 +581,7 @@ template <typename Torus> struct int_radix_lut {
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
cuda_set_device(gpu_indexes[0]);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -646,6 +648,206 @@ template <typename Torus> struct int_radix_lut {
free(max_degrees);
}
};
template <typename InputTorus> struct int_noise_squashing_lut {
int_radix_params params;
uint32_t input_glwe_dimension;
uint32_t input_polynomial_size;
uint32_t input_big_lwe_dimension;
uint32_t num_blocks;
// Tracks the degree of each LUT and the max degree on CPU.
// The max degree is (message_modulus * carry_modulus - 1), except for
// many-LUT, for which it's different
uint64_t *degrees;
uint64_t *max_degrees;
int active_gpu_count;
// There will be one buffer on each GPU in multi-GPU computations
// (same for tmp lwe arrays)
std::vector<int8_t *> pbs_buffer;
std::vector<__uint128_t *> lut_vec;
uint32_t *gpu_indexes;
CudaRadixCiphertextFFI *tmp_lwe_before_ks;
// All tmp lwe arrays and index arrays for lwe contain the total
// number of blocks to be computed on; there is no split between GPUs
// for the moment
InputTorus *lwe_indexes_in;
InputTorus *h_lwe_indexes_in;
InputTorus *h_lwe_indexes_out;
InputTorus *lwe_trivial_indexes;
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<InputTorus *> lwe_array_in_vec;
std::vector<InputTorus *> lwe_after_ks_vec;
std::vector<__uint128_t *> lwe_after_pbs_vec;
std::vector<InputTorus *> lwe_trivial_indexes_vec;
bool using_trivial_lwe_indexes = true;
bool gpu_memory_allocated;
// noise squashing constructor
int_noise_squashing_lut(cudaStream_t const *streams,
uint32_t const *input_gpu_indexes, uint32_t gpu_count,
int_radix_params params,
uint32_t input_glwe_dimension,
uint32_t input_polynomial_size,
uint32_t num_radix_blocks,
uint32_t original_num_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
this->params = params;
this->num_blocks = num_radix_blocks;
gpu_memory_allocated = allocate_gpu_memory;
// These are the glwe dimension and polynomial size before squashing
this->input_glwe_dimension = input_glwe_dimension;
this->input_polynomial_size = input_polynomial_size;
uint32_t input_big_lwe_dimension =
input_glwe_dimension * input_polynomial_size;
this->input_big_lwe_dimension = input_big_lwe_dimension;
uint32_t lut_buffer_size = (params.glwe_dimension + 1) *
params.polynomial_size * sizeof(__uint128_t);
gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(i);
auto num_radix_blocks_on_gpu =
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
int8_t *gpu_pbs_buffer;
uint64_t size = 0;
execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
params.small_lwe_dimension, params.glwe_dimension,
params.polynomial_size, params.pbs_level,
num_radix_blocks_on_gpu, allocate_gpu_memory,
params.allocate_ms_array, &size);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
if (i == 0 && size_tracker != nullptr) {
*size_tracker += size;
}
pbs_buffer.push_back(gpu_pbs_buffer);
}
lwe_indexes_in = (InputTorus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
lwe_trivial_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
h_lwe_indexes_in =
(InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus));
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes_in[i] = i;
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(InputTorus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
lwe_array_in_vec, num_radix_blocks,
params.big_lwe_dimension + 1, size_tracker,
allocate_gpu_memory);
multi_gpu_alloc_lwe_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
num_radix_blocks, params.small_lwe_dimension + 1, size_tracker,
allocate_gpu_memory);
multi_gpu_alloc_lwe_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
num_radix_blocks, params.big_lwe_dimension + 1, size_tracker,
allocate_gpu_memory);
multi_gpu_alloc_array_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_trivial_indexes_vec,
num_radix_blocks, size_tracker, allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, lwe_trivial_indexes,
num_radix_blocks, allocate_gpu_memory);
if (allocate_gpu_memory) {
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
for (uint i = 0; i < active_gpu_count; i++) {
auto lut = (__uint128_t *)cuda_malloc_with_size_tracking_async(
lut_buffer_size, streams[i], gpu_indexes[i], size_tracker,
allocate_gpu_memory);
lut_vec.push_back(lut);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
// Keyswitch
tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<InputTorus>(
streams[0], gpu_indexes[0], tmp_lwe_before_ks, original_num_blocks,
input_big_lwe_dimension, size_tracker, allocate_gpu_memory);
degrees = (uint64_t *)malloc(sizeof(uint64_t));
max_degrees = (uint64_t *)malloc(sizeof(uint64_t));
// lut for the squashing
auto f_squash = [](__uint128_t block) -> __uint128_t { return block; };
// Generate the identity LUT, for now we only use one GPU
for (uint i = 0; i < active_gpu_count; i++) {
auto squash_lut = lut_vec[i];
generate_device_accumulator<__uint128_t>(
streams[i], gpu_indexes[i], squash_lut, degrees, max_degrees,
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, f_squash, allocate_gpu_memory);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
free(this->gpu_indexes);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_drop_with_size_tracking_async(lut_vec[i], streams[i], gpu_indexes[i],
gpu_memory_allocated);
}
cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
lut_vec.clear();
free(h_lwe_indexes_in);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
tmp_lwe_before_ks, gpu_memory_allocated);
for (int i = 0; i < pbs_buffer.size(); i++) {
cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
&pbs_buffer[i]);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec);
for (uint i = 0; i < active_gpu_count; i++)
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
lwe_array_in_vec.clear();
lwe_after_ks_vec.clear();
lwe_after_pbs_vec.clear();
lwe_trivial_indexes_vec.clear();
delete tmp_lwe_before_ks;
pbs_buffer.clear();
}
};
template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
@@ -1076,9 +1278,10 @@ template <typename Torus> struct int_overflowing_sub_memory {
luts_array->get_degree(1), luts_array->get_max_degree(1),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_does_block_generate_or_propagate, gpu_memory_allocated);
cuda_set_value_async<Torus>(streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, 1), 1,
num_radix_blocks - 1);
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, 1), 1,
num_radix_blocks - 1);
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], luts_borrow_propagation_sum->get_lut(0, 0),
@@ -1116,18 +1319,123 @@ template <typename Torus> struct int_overflowing_sub_memory {
};
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaRadixCiphertextFFI *new_blocks;
CudaRadixCiphertextFFI *new_blocks_copy;
CudaRadixCiphertextFFI *old_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
int_radix_params params;
int32_t *d_smart_copy_in;
int32_t *d_smart_copy_out;
bool mem_reuse = false;
uint32_t max_total_blocks_in_vec;
uint32_t num_blocks_in_radix;
uint32_t max_num_radix_in_vec;
uint32_t chunk_size;
uint64_t *size_tracker;
bool gpu_memory_allocated;
// temporary buffers
CudaRadixCiphertextFFI *current_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
uint32_t *d_columns_data;
uint32_t *d_columns_counter;
uint32_t **d_columns;
uint32_t *d_new_columns_data;
uint32_t *d_new_columns_counter;
uint32_t **d_new_columns;
uint64_t *d_degrees;
// lookup table for extracting message and carry
int_radix_lut<Torus> *luts_message_carry;
bool mem_reuse = false;
bool allocated_luts_message_carry;
void setup_index_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes) {
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
size_tracker, gpu_memory_allocated);
auto num_blocks_in_radix = this->num_blocks_in_radix;
auto max_num_radix_in_vec = this->max_num_radix_in_vec;
auto setup_columns =
[num_blocks_in_radix, max_num_radix_in_vec, streams,
gpu_indexes](uint32_t **&columns, uint32_t *&columns_data,
uint32_t *&columns_counter, uint64_t *size_tracker,
bool gpu_memory_allocated) {
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
streams[0], gpu_indexes[0], size_tracker, gpu_memory_allocated);
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
cuda_memset_with_size_tracking_async(
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
streams[0], gpu_indexes[0], gpu_memory_allocated);
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
for (int i = 0; i < num_blocks_in_radix; ++i) {
h_columns[i] = columns_data + i * max_num_radix_in_vec;
}
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t *), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
if (gpu_memory_allocated) {
cuda_memcpy_async_to_gpu(columns, h_columns,
num_blocks_in_radix * sizeof(uint32_t *),
streams[0], gpu_indexes[0]);
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete[] h_columns;
};
setup_columns(d_columns, d_columns_data, d_columns_counter, size_tracker,
gpu_memory_allocated);
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter,
size_tracker, gpu_memory_allocated);
}
void setup_lookup_tables(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
uint32_t message_modulus = params.message_modulus;
if (!mem_reuse) {
uint32_t pbs_count = std::max(2 * (max_total_blocks_in_vec / chunk_size),
2 * num_blocks_in_radix);
if (max_total_blocks_in_vec > 0) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 2, pbs_count,
gpu_memory_allocated, size_tracker);
} else {
allocated_luts_message_carry = false;
}
}
if (allocated_luts_message_carry) {
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
}
}
int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
@@ -1136,103 +1444,87 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
bool allocate_gpu_memory,
uint64_t *size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
this->mem_reuse = false;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
(params.message_modulus - 1);
this->allocated_luts_message_carry = true;
setup_index_buffers(streams, gpu_indexes);
setup_lookup_tables(streams, gpu_indexes, gpu_count);
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
// allocate gpu memory for intermediate buffers
new_blocks = new CudaRadixCiphertextFFI;
// create and allocate intermediate buffers
current_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
old_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
}
int_sum_ciphertexts_vec_memory(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
CudaRadixCiphertextFFI *old_blocks,
CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *current_blocks,
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut, bool allocate_gpu_memory,
uint64_t *size_tracker) {
mem_reuse = true;
gpu_memory_allocated = allocate_gpu_memory;
this->mem_reuse = true;
this->params = params;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
(params.message_modulus - 1);
this->allocated_luts_message_carry = true;
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
// assign gpu memory for intermediate buffers
this->new_blocks = new_blocks;
this->old_blocks = old_blocks;
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
this->luts_message_carry = reused_lut;
setup_index_buffers(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
gpu_indexes[0], gpu_memory_allocated);
if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
small_lwe_vector, gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks;
delete old_blocks;
if (allocated_luts_message_carry) {
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete luts_message_carry;
}
delete current_blocks;
delete small_lwe_vector;
}
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks_copy;
}
};
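
A worked instance of the chunk_size formula set by both constructors above, as a sketch assuming default 2_2-style parameters (message_modulus = carry_modulus = 4): a block saturates at message_modulus * carry_modulus - 1 = 15 and each summand contributes at most message_modulus - 1 = 3, so 15 / 3 = 5 ciphertexts can be accumulated per column before message/carry extraction.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t message_modulus = 4, carry_modulus = 4; // 2_2-style parameters
  // Same formula as int_sum_ciphertexts_vec_memory::chunk_size.
  uint32_t chunk_size =
      (message_modulus * carry_modulus - 1) / (message_modulus - 1);
  assert(chunk_size == 5); // five blocks can be summed without overflow
  return 0;
}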
// For sequential algorithm in group propagation
template <typename Torus> struct int_seq_group_prop_memory {
@@ -2549,7 +2841,7 @@ template <typename Torus> struct int_mul_memory {
// radix_lwe_left except the last blocks of each shift
int msb_vector_block_count = num_radix_blocks * (num_radix_blocks - 1) / 2;
int total_block_count = lsb_vector_block_count + msb_vector_block_count;
int total_block_count = num_radix_blocks * num_radix_blocks;
// allocate memory for intermediate buffers
vector_result_sb = new CudaRadixCiphertextFFI;
@@ -2562,13 +2854,13 @@ template <typename Torus> struct int_mul_memory {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, total_block_count,
streams[0], gpu_indexes[0], small_lwe_vector, 2 * total_block_count,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
// create int_radix_lut objects for lsb, msb, message, carry
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
params, 2, total_block_count,
params, 2, 2 * total_block_count,
allocate_gpu_memory, size_tracker);
auto lsb_acc = luts_array->get_lut(0, 0);
auto msb_acc = luts_array->get_lut(0, 1);
@@ -2595,16 +2887,17 @@ template <typename Torus> struct int_mul_memory {
// first lsb_vector_block_count value should reference to lsb_acc
// last msb_vector_block_count values should reference to msb_acc
// for message and carry default lut_indexes_vec is fine
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
luts_array->broadcast_lut(streams, gpu_indexes, 0);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
2 * num_radix_blocks, vector_result_sb, small_lwe_vector, luts_array,
allocate_gpu_memory, size_tracker);
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
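
A quick consistency check for the total_block_count change above. It assumes lsb_vector_block_count = num_radix_blocks * (num_radix_blocks + 1) / 2, which is defined outside this hunk; under that assumption, the old lsb + msb sum equals the new num_radix_blocks * num_radix_blocks value for every block count.

#include <cassert>

int main() {
  for (int n = 1; n <= 128; n++) {
    int lsb = n * (n + 1) / 2;  // assumed lsb_vector_block_count
    int msb = n * (n - 1) / 2;  // msb count from the hunk above
    assert(lsb + msb == n * n); // matches the new total_block_count
  }
  return 0;
}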
@@ -2750,9 +3043,10 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
tmp_rotated = pre_allocated_buffer;
reuse_memory = true;
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
tmp_rotated, 0,
tmp_rotated->num_radix_blocks);
if (allocate_gpu_memory)
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], tmp_rotated, 0,
tmp_rotated->num_radix_blocks);
uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
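
The allocate_gpu_memory guard added above follows the size-tracking dry-run pattern used throughout these buffers: when allocation is disabled, sizes are still accumulated but no kernel may touch the never-allocated memory. A host-only sketch with hypothetical stand-ins for the cuda_* helpers:

#include <cstdint>
#include <cstdlib>

// Stand-in for cuda_malloc_with_size_tracking_async: counts bytes even in
// dry-run mode, allocates only when asked (host malloc as a placeholder).
void *malloc_with_size_tracking(uint64_t size, uint64_t *size_tracker,
                                bool allocate) {
  if (size_tracker != nullptr)
    *size_tracker += size;
  return allocate ? std::malloc(size) : nullptr;
}

// Stand-in for a launch such as set_zero_radix_ciphertext_slice_async.
void set_zero(void *ptr, uint64_t size, bool allocated) {
  if (!allocated)
    return; // the guard above: never touch unallocated buffers
  for (uint64_t i = 0; i < size; i++)
    static_cast<char *>(ptr)[i] = 0;
}

int main() {
  uint64_t size_tracker = 0;
  bool allocate_gpu_memory = false; // dry run: measure, do not allocate
  void *buf =
      malloc_with_size_tracking(1024, &size_tracker, allocate_gpu_memory);
  set_zero(buf, 1024, allocate_gpu_memory);
  // size_tracker == 1024 even though buf == nullptr
  std::free(buf); // free(nullptr) is a no-op
  return 0;
}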
@@ -3918,7 +4212,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_did_not_happen[0]->get_degree(0),
zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 2, gpu_memory_allocated);
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
0);
generate_device_accumulator_bivariate_with_factor<Torus>(
@@ -3927,7 +4222,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_did_not_happen[1]->get_degree(0),
zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 3, gpu_memory_allocated);
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
0);
@@ -3954,7 +4250,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_happened[0]->get_degree(0),
zero_out_if_overflow_happened[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, 2, gpu_memory_allocated);
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes, 0);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
@@ -3962,7 +4259,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_happened[1]->get_degree(0),
zero_out_if_overflow_happened[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, 3, gpu_memory_allocated);
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes, 0);
// merge_overflow_flags_luts
@@ -4378,26 +4676,28 @@ template <typename Torus> struct int_scalar_mul_buffer {
int_sc_prop_memory<Torus> *sc_prop_mem;
bool anticipated_buffers_drop;
bool gpu_memory_allocated;
uint32_t num_ciphertext_bits;
int_scalar_mul_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, bool anticipated_buffer_drop,
uint64_t *size_tracker) {
uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool anticipated_buffer_drop, uint64_t *size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->anticipated_buffers_drop = anticipated_buffer_drop;
uint32_t msg_bits = (uint32_t)std::log2(params.message_modulus);
size_t num_ciphertext_bits = msg_bits * num_radix_blocks;
num_ciphertext_bits = msg_bits * num_scalar_bits;
//// Contains all shifted values of lhs for shift in range (0..msg_bits)
//// The idea is that with these we can create all other shifts that are
//// in range (0..total_bits) for free (block rotation)
preshifted_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], preshifted_buffer, num_ciphertext_bits,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
streams[0], gpu_indexes[0], preshifted_buffer,
msg_bits * num_radix_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
all_shifted_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -4414,9 +4714,11 @@ template <typename Torus> struct int_scalar_mul_buffer {
streams, gpu_indexes, gpu_count, LEFT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_ciphertext_bits, allocate_gpu_memory, size_tracker);
if (num_ciphertext_bits > 0) {
sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_ciphertext_bits, allocate_gpu_memory, size_tracker);
}
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
sc_prop_mem = new int_sc_prop_memory<Torus>(
@@ -4428,9 +4730,11 @@ template <typename Torus> struct int_scalar_mul_buffer {
uint32_t gpu_count) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
all_shifted_buffer, gpu_memory_allocated);
sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
if (num_ciphertext_bits > 0) {
sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
delete sum_ciphertexts_vec_mem;
}
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sum_ciphertexts_vec_mem;
delete sc_prop_mem;
delete all_shifted_buffer;
if (!anticipated_buffers_drop) {
@@ -4686,6 +4990,169 @@ template <typename Torus> struct int_div_rem_memory {
}
};
template <typename Torus> struct int_scalar_mul_high {
int_radix_params params;
bool allocate_gpu_memory;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem;
int_scalar_mul_buffer<Torus> *scalar_mul_mem;
CudaRadixCiphertextFFI *tmp;
int_scalar_mul_high(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
SHIFT_OR_ROTATE_TYPE shift_type, uint32_t num_scalar_bits,
bool anticipated_buffer_drop, uint64_t *size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params,
2 * num_radix_blocks, allocate_gpu_memory, size_tracker);
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, 2 * num_radix_blocks,
num_scalar_bits, allocate_gpu_memory, anticipated_buffer_drop,
size_tracker);
this->tmp = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp, 2 * num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
logical_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
delete logical_scalar_shift_mem;
scalar_mul_mem->release(streams, gpu_indexes, gpu_count);
delete scalar_mul_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp,
allocate_gpu_memory);
delete tmp;
}
};
template <typename Torus> struct int_sub_and_propagate {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *neg_rhs_array;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sub_and_propagate(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t *size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sc_prop_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
allocate_gpu_memory);
delete neg_rhs_array;
}
};
template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
int_radix_params params;
bool allocate_gpu_memory;
int_radix_lut<Torus> *lut;
CudaRadixCiphertextFFI *last_block;
CudaRadixCiphertextFFI *padding_block;
int_extend_radix_with_sign_msb_buffer(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
uint32_t num_radix_blocks, uint32_t num_additional_blocks,
const bool allocate_gpu_memory, uint64_t *size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->lut = nullptr;
this->last_block = nullptr;
this->padding_block = nullptr;
if (num_additional_blocks != 0) {
this->lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
uint32_t bits_per_block = std::log2(params.message_modulus);
uint32_t msg_modulus = params.message_modulus;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus,
[msg_modulus, bits_per_block](Torus x) {
const auto xm = x % msg_modulus;
const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
return (Torus)((msg_modulus - 1) * sign_bit);
},
allocate_gpu_memory);
this->last_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], last_block, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->padding_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], padding_block, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
if (lut != nullptr) {
lut->release(streams, gpu_indexes, gpu_count);
delete lut;
}
if (last_block != nullptr) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], last_block,
allocate_gpu_memory);
delete last_block;
}
if (padding_block != nullptr) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], padding_block,
allocate_gpu_memory);
delete padding_block;
}
}
};
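
A standalone check of the sign-extension LUT generated in the constructor above, for 2-bit blocks (message_modulus = 4): the padding block is all ones exactly when the sign bit of the most significant block is set, after reducing the block mod message_modulus.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t msg_modulus = 4;
  const uint32_t bits_per_block = 2; // log2(msg_modulus)
  // Same function as the lambda passed to generate_device_accumulator.
  auto lut = [&](uint64_t x) -> uint64_t {
    const auto xm = x % msg_modulus;
    const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
    return (msg_modulus - 1) * sign_bit;
  };
  assert(lut(0b01) == 0);     // sign bit clear: pad with zero blocks
  assert(lut(0b10) == 0b11);  // sign bit set: pad with all-ones blocks
  assert(lut(0b111) == 0b11); // carry bits are reduced mod msg_modulus first
  return 0;
}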
void update_degrees_after_bitand(uint64_t *output_degrees,
uint64_t *lwe_array_1_degrees,
uint64_t *lwe_array_2_degrees,

View File

@@ -0,0 +1,13 @@
#ifndef CUDA_BOOTSTRAP_128_H
#define CUDA_BOOTSTRAP_128_H
#include "pbs_enums.h"
#include <stdint.h>
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array);
#endif // CUDA_BOOTSTRAP_128_H

View File

@@ -240,14 +240,13 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <PBS_TYPE pbs_type> struct pbs_buffer_128;
template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
int8_t *d_mem;
__uint128_t *global_accumulator;
double *global_join_buffer;
__uint128_t *temp_lwe_array_in;
InputTorus *temp_lwe_array_in;
uint64_t *trivial_indexes;
PBS_VARIANT pbs_variant;
bool uses_noise_reduction;
@@ -263,11 +262,25 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
this->uses_noise_reduction = allocate_ms_array;
this->temp_lwe_array_in =
(__uint128_t *)cuda_malloc_with_size_tracking_async(
(lwe_dimension + 1) * input_lwe_ciphertext_count *
sizeof(__uint128_t),
stream, gpu_index, size_tracker, allocate_ms_array);
if (allocate_ms_array) {
this->temp_lwe_array_in = (InputTorus *)cuda_malloc_async(
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(InputTorus),
stream, gpu_index);
this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
size_tracker, allocate_ms_array);
uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
h_trivial_indexes[i] = i;
cuda_memcpy_with_size_tracking_async_to_gpu(
trivial_indexes, h_trivial_indexes,
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
allocate_gpu_memory);
cuda_synchronize_stream(stream, gpu_index);
delete[] h_trivial_indexes;
}
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
@@ -404,9 +417,12 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);
if (uses_noise_reduction)
if (uses_noise_reduction) {
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
gpu_memory_allocated);
}
}
};
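
The trivial_indexes buffer allocated above is simply the identity permutation; it lets kernels that expect per-ciphertext index arrays process an unpermuted batch without a separate code path. A host-side sketch of the fill:

#include <cstdint>
#include <vector>

int main() {
  const uint32_t input_lwe_ciphertext_count = 8;
  std::vector<uint64_t> h_trivial_indexes(input_lwe_ciphertext_count);
  for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
    h_trivial_indexes[i] = i; // identity: the i-th LWE stays the i-th LWE
  // In the real buffer this array is copied to the GPU once and reused.
  return 0;
}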
@@ -502,7 +518,12 @@ template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count);
uint32_t level_count,
uint32_t max_shared_memory);
bool has_support_to_cuda_programmable_bootstrap_128_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,

View File

@@ -100,7 +100,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
void const *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);

View File

@@ -112,15 +112,15 @@ template <typename Torus> struct zk_expand_mem {
// Hint for future readers: if message_modulus == 4 then
// packed_messages_per_lwe becomes 2
auto packed_messages_per_lwe = log2_int(params.message_modulus);
auto num_packed_msgs = log2_int(params.message_modulus);
// Adjust indexes to permute the output and access the correct LUT
auto h_indexes_in = static_cast<Torus *>(
malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
auto h_indexes_out = static_cast<Torus *>(
malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
auto h_lut_indexes = static_cast<Torus *>(
malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
auto h_body_id_per_compact_list =
static_cast<uint32_t *>(malloc(num_lwes * sizeof(uint32_t)));
auto h_lwe_compact_input_indexes =
@@ -138,6 +138,10 @@ template <typename Torus> struct zk_expand_mem {
auto compact_list_id = 0;
auto idx = 0;
auto count = 0;
// During flattening, all num_lwes LWEs from all compact lists are stored
// sequentially in a Torus array. h_lwe_compact_input_indexes stores the
// index of the first LWE of the compact list that contains the i-th
// LWE
for (int i = 0; i < num_lwes; i++) {
h_lwe_compact_input_indexes[i] = idx;
count++;
@@ -148,6 +152,8 @@ template <typename Torus> struct zk_expand_mem {
}
}
// Stores, for the k-th compact list, the index of the i-th LWE within
// that list.
auto offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
@@ -159,46 +165,75 @@ template <typename Torus> struct zk_expand_mem {
offset += num_lwes_in_kth_compact_list;
}
/*
* Each LWE contains encrypted data in both carry and message spaces
* that needs to be extracted.
*
* The loop processes each compact list (k) and for each LWE within that
* list:
* 1. Sets input indexes to read each LWE twice (for carry and message
* extraction)
* 2. Creates output indexes to properly reorder the results
* 3. Selects appropriate LUT index based on whether boolean sanitization is
* needed
*
* We want the output to have always first the content of the message part
* and then the content of the carry part of each LWE.
*
* i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
* carry_extract(LWE_1), ...
*
* Aiming that behavior, with 4 LWEs we would have:
*
* // Each LWE is processed twice
* h_indexes_in = {0, 1, 2, 3, 0, 1, 2, 3}
*
* // First 4 use message LUT, last 4 use carry LUT
* h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}
*
* // Reorders output so message and carry for each LWE appear together
* h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}
*
* If an LWE contains a boolean value, its LUT index is shifted by
* num_packed_msgs to use the sanitization LUT (which ensures output is
* exactly 0 or 1).
*/
offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
for (int i = 0;
i < packed_messages_per_lwe * num_lwes_in_kth_compact_list; i++) {
Torus j = i % num_lwes_in_kth_compact_list;
h_indexes_in[i + packed_messages_per_lwe * offset] = j + offset;
h_indexes_out[i + packed_messages_per_lwe * offset] =
packed_messages_per_lwe * (j + offset) +
(i / num_lwes_in_kth_compact_list);
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto lwe_index_in_list = i % num_lwes_in_kth;
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
h_indexes_out[lwe_index] =
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
// If the input relates to a boolean, shift the LUT so the correct one
// with sanitization is used
h_lut_indexes[i + packed_messages_per_lwe * offset] =
(is_boolean_array[h_indexes_out[i +
packed_messages_per_lwe * offset]]
? packed_messages_per_lwe
: 0) +
i / num_lwes_in_kth_compact_list;
auto boolean_offset =
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
}
offset += num_lwes_in_kth_compact_list;
offset += num_lwes_in_kth;
}
message_and_carry_extract_luts->set_lwe_indexes(
streams[0], gpu_indexes[0], h_indexes_in, h_indexes_out);
auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_lwe_compact_input_indexes, h_lwe_compact_input_indexes,
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_indexes, h_lut_indexes,
packed_messages_per_lwe * num_lwes * sizeof(Torus), streams[0],
gpu_indexes[0], allocate_gpu_memory);
lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_body_id_per_compact_list, h_body_id_per_compact_list,
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
allocate_gpu_memory);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
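
The loop above generalizes the 4-LWE example from the comment to several compact lists. A host-only C++ sketch reproducing the same index computation for two compact lists of two LWEs each (num_packed_msgs = 2, no booleans); it prints h_indexes_in = 0 1 0 1 2 3 2 3, h_indexes_out = 0 2 1 3 4 6 5 7 and h_lut_indexes = 0 0 1 1 0 0 1 1:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int num_packed_msgs = 2;           // log2(message_modulus) for 2_2
  std::vector<int> lwes_per_list = {2, 2}; // two compact lists of two LWEs
  const int num_lwes = 4;
  std::vector<uint64_t> in(num_packed_msgs * num_lwes);
  std::vector<uint64_t> out(in.size()), lut(in.size());
  std::vector<bool> is_boolean(in.size(), false);

  int offset = 0;
  for (int num_in_list : lwes_per_list) {
    for (int i = 0; i < num_packed_msgs * num_in_list; i++) {
      int lwe_index = i + num_packed_msgs * offset;
      in[lwe_index] = i % num_in_list + offset;
      out[lwe_index] = num_packed_msgs * in[lwe_index] + i / num_in_list;
      int boolean_offset = is_boolean[out[lwe_index]] ? num_packed_msgs : 0;
      lut[lwe_index] = i / num_in_list + boolean_offset;
    }
    offset += num_in_list;
  }
  for (auto v : in) std::cout << v << ' ';  // 0 1 0 1 2 3 2 3
  std::cout << '\n';
  for (auto v : out) std::cout << v << ' '; // 0 2 1 3 4 6 5 7
  std::cout << '\n';
  for (auto v : lut) std::cout << v << ' '; // 0 0 1 1 0 0 1 1
  std::cout << '\n';
  return 0;
}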

View File

@@ -84,15 +84,19 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
static_cast<uint64_t *>(lwe_array_out), size, log_modulus);
}
// This entry point is used only for testing purposes;
// its output always follows trivial ordering
void cuda_improve_noise_modulus_switch_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
double r_sigma, double bound, uint32_t log_modulus) {
void const *lwe_array_in, void const *lwe_array_indexes,
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
uint32_t log_modulus) {
host_improve_noise_modulus_switch<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t const *>(lwe_array_in),
static_cast<uint64_t const *>(lwe_array_indexes),
static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
num_zeros, input_variance, r_sigma, bound, log_modulus);
}

View File

@@ -172,14 +172,14 @@ __host__ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe(
// allocate at least LWE-mask times two: to keep both decomposition state and
// decomposed intermediate value
int memory_unit = glwe_accumulator_size > lwe_dimension * 2
? glwe_accumulator_size
: lwe_dimension * 2;
uint64_t memory_unit = glwe_accumulator_size > lwe_dimension * 2
? glwe_accumulator_size
: lwe_dimension * 2;
uint64_t size_tracker;
uint64_t buffer_size = 2 * num_lwes * memory_unit * sizeof(Torus);
*fp_ks_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
2 * num_lwes * memory_unit * sizeof(Torus), stream, gpu_index,
&size_tracker, allocate_gpu_memory);
buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
return size_tracker;
}
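
The widening of memory_unit above matters because, with 32-bit arithmetic, the element count 2 * num_lwes * memory_unit can wrap before sizeof(Torus) promotes the product to 64 bits, silently under-sizing the buffer. A minimal illustration:

#include <cstdint>
#include <iostream>

int main() {
  uint32_t num_lwes = 1u << 16;
  uint32_t memory_unit = 1u << 16; // e.g. a large glwe_accumulator_size
  // The 32-bit product wraps modulo 2^32 before any widening:
  uint32_t wrapped = 2 * num_lwes * memory_unit;           // 0
  uint64_t widened = 2 * num_lwes * (uint64_t)memory_unit; // 2^33
  std::cout << wrapped << " vs " << widened << "\n";
  return 0;
}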

View File

@@ -178,11 +178,12 @@ __device__ __forceinline__ double measure_modulus_switch_noise(
// Each thread processes two elements of the lwe array
template <typename Torus>
__global__ void __launch_bounds__(512)
improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
const uint64_t *indexes, const Torus *zeros,
int lwe_size, int num_zeros,
double input_variance, double r_sigma,
double bound, uint32_t log_modulus) {
// First we will assume size is less than the number of threads per block
// I should switch this to dynamic shared memory
@@ -198,13 +199,14 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
// These are probably not needed because we set the values below
sum_mask_errors[threadIdx.x] = 0.f;
sum_squared_mask_errors[threadIdx.x] = 0.f;
auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
// We use modulus switch to gather the output in trivial order
auto this_block_lwe_out = array_out + blockIdx.x * lwe_size;
Torus input_element1 = this_block_lwe_in[threadIdx.x];
Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
? this_block_lwe_in[threadIdx.x + blockDim.x]
: 0;
// Base noise is only handled by thread 0
double base_noise = measure_modulus_switch_noise<Torus>(
@@ -218,11 +220,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
__syncthreads();
if (found)
this_block_lwe_out[threadIdx.x] = input_element1;
if (found && (threadIdx.x + blockDim.x) < lwe_size)
this_block_lwe_out[threadIdx.x + blockDim.x] = input_element2;
__syncthreads();
// If we found a zero element we stop iterating (in avg 20 times are
@@ -253,11 +254,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
// Assumption we always have at least 512 elements
// If we find a useful zero encryption we replace the lwe by lwe + zero
if (found)
this_block_lwe_out[threadIdx.x] = zero_element1;
if (found && (threadIdx.x + blockDim.x) < lwe_size)
this_block_lwe_out[threadIdx.x + blockDim.x] = zero_element2;
__syncthreads();
// If we found a zero element we stop iterating (in avg 20 times are
@@ -270,9 +270,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
template <typename Torus>
__host__ void host_improve_noise_modulus_switch(
cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
const double input_variance, const double r_sigma, const double bound,
uint32_t log_modulus) {
if (lwe_size < 512) {
PANIC("The lwe_size is less than 512, this is not supported\n");
@@ -289,8 +290,8 @@ __host__ void host_improve_noise_modulus_switch(
int num_threads = 512, num_blocks = num_lwes;
improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
r_sigma, bound, log_modulus);
check_cuda_error(cudaGetLastError());
}
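The added `indexes` argument lets each block read its input LWE from an arbitrary position while the output stays in trivial order, which is the classic gather pattern; a minimal sketch with hypothetical names:
template <typename Torus>
__global__ void gather_lwe(Torus *array_out, const Torus *array_in,
const uint64_t *indexes, int lwe_size) {
// block b writes output row b from input row indexes[b]
const Torus *src = array_in + indexes[blockIdx.x] * lwe_size;
Torus *dst = array_out + blockIdx.x * lwe_size;
for (int i = threadIdx.x; i < lwe_size; i += blockDim.x)
dst[i] = src[i];
}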

View File

@@ -492,6 +492,7 @@ __host__ void host_fourier_transform_forward_as_integer_f128(
batch_convert_u128_to_f128_as_integer<params>
<<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
d_standard);
check_cuda_error(cudaGetLastError());
// call negacyclic 128 bit forward fft.
if (full_sm) {
@@ -503,6 +504,7 @@ __host__ void host_fourier_transform_forward_as_integer_f128(
<<<grid_size, block_size, shared_memory_size, stream>>>(
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
}
check_cuda_error(cudaGetLastError());
cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
gpu_index);

View File

@@ -63,7 +63,7 @@ void update_degrees_after_bitor(uint64_t *output_degrees,
auto result = max;
for (uint j = 0; j < min + 1; j++) {
if ((max | j) > result) {
result = max | j;
}
}
@@ -82,7 +82,7 @@ void update_degrees_after_bitxor(uint64_t *output_degrees,
// Try every possibility to find the worst case
for (uint j = 0; j < min + 1; j++) {
if ((max ^ j) > result) {
result = max ^ j;
}
}
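The added parentheses matter because `>` binds tighter than `|` and `^` in C++, so the old condition OR-ed the degree with a boolean instead of comparing the OR; a minimal demonstration:
#include <cassert>
int main() {
unsigned max = 4, j = 2, result = 4;
// old parse: max | (j > result) == 4 | 0 == 4, so the branch never fired
assert((max | j > result) == 4u);
// fixed parse: (max | j) == 6 is correctly compared against result
assert((max | j) > result);
}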

View File

@@ -36,7 +36,7 @@ __host__ void host_integer_radix_bitop_kb(
update_degrees_after_bitor(degrees, lwe_array_1->degrees,
lwe_array_2->degrees,
lwe_array_1->num_radix_blocks);
} else if (mem_ptr->op == BITOP_TYPE::BITXOR) {
update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
lwe_array_2->degrees,
lwe_array_1->num_radix_blocks);

View File

@@ -0,0 +1,62 @@
#include "cast.cuh"
void extend_radix_with_trivial_zero_blocks_msb_64(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
void *const *streams, uint32_t const *gpu_indexes) {
host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(
output, input, (cudaStream_t *)streams, gpu_indexes);
}
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
void *const *streams,
uint32_t const *gpu_indexes) {
host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
gpu_indexes);
}
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_additional_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_extend_radix_with_sign_msb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
(int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr, params,
num_blocks, num_additional_blocks, allocate_gpu_memory);
}
void cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_extend_radix_with_sign_msb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
}
void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

View File

@@ -0,0 +1,94 @@
#ifndef CAST_CUH
#define CAST_CUH
#include "device.h"
#include "integer.cuh"
#include "integer/integer_utilities.h"
template <typename Torus>
__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
cudaStream_t const *streams, uint32_t const *gpu_indexes) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
0, input->num_radix_blocks, input, 0,
input->num_radix_blocks);
}
template <typename Torus>
__host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
cudaStream_t const *streams,
uint32_t const *gpu_indexes) {
const uint32_t input_start_lwe_index =
input->num_radix_blocks - output->num_radix_blocks;
if (input->num_radix_blocks <= output->num_radix_blocks) {
PANIC("Cuda error: input num blocks should be greater than output num "
"blocks");
}
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
input_start_lwe_index, input->num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_extend_radix_with_sign_msb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
const int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_additional_blocks, allocate_gpu_memory, &size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_extend_radix_with_sign_msb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
if (num_additional_blocks == 0) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
input);
return;
}
const uint32_t input_blocks = input->num_radix_blocks;
if (input_blocks == 0) {
PANIC("Cuda error: input blocks cannot be zero");
}
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
0, input_blocks, input, 0,
input_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->last_block, 0, 1, input,
input_blocks - 1, input_blocks);
host_apply_univariate_lut_kb(
streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
uint32_t dst_block_idx = input_blocks + i;
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
dst_block_idx, dst_block_idx + 1,
mem_ptr->padding_block, 0, 1);
}
}
#endif
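The padding block computed through host_apply_univariate_lut_kb replicates the sign of the most significant block into every added block. A plausible shape for that LUT function, assuming the usual sign convention for radix integers (the real table is built inside int_extend_radix_with_sign_msb_buffer, which is not shown here):
// the sign of a radix integer is the top bit of its last block;
// negative values extend with all-ones message blocks, positive with zeros
uint64_t sign_padding_lut(uint64_t last_block, uint64_t message_modulus) {
bool is_negative = last_block >= message_modulus / 2;
return is_negative ? message_modulus - 1 : 0;
}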

View File

@@ -456,7 +456,7 @@ __host__ void tree_sign_reduction(
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
while (partial_block_count > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -477,16 +477,17 @@ __host__ void tree_sign_reduction(
auto last_lut = tree_buffer->tree_last_leaf_lut;
auto block_selector_f = tree_buffer->block_selector_f;
std::function<Torus(Torus)> f;
auto num_bits_in_message = log2_int(params.message_modulus);
if (partial_block_count == 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
message_modulus);
f = [block_selector_f, sign_handler_f, num_bits_in_message,
message_modulus](Torus x) -> Torus {
Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
Torus lsb = x & (message_modulus - 1);
Torus final_sign = block_selector_f(msb, lsb);
return sign_handler_f(final_sign);
};
} else {
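The hard-coded 4 and (x >> 2) & 3 silently assumed message_modulus == 4; the generalized pack/unpack arithmetic they are replaced with, as a minimal sketch:
#include <cassert>
#include <cstdint>
int main() {
uint64_t message_modulus = 4;      // e.g. 2_2 parameters
uint64_t num_bits_in_message = 2;  // log2_int(message_modulus)
uint64_t msb = 3, lsb = 1;
// pack_blocks merges two blocks into one: msb * modulus + lsb
uint64_t packed = msb * message_modulus + lsb;
// the LUT body recovers both halves with a shift and a mask
assert(((packed >> num_bits_in_message) & (message_modulus - 1)) == msb);
assert((packed & (message_modulus - 1)) == lsb);
// for message_modulus == 4 this reduces exactly to the old (x >> 2) & 3
}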

View File

@@ -386,8 +386,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
subtraction_overflowed,
at_least_one_upper_block_is_non_zero, 1);
auto message_modulus = radix_params.message_modulus;
int factor = (i) ? message_modulus - 1 : message_modulus - 2;
int factor_lut_id = (i) ? 1 : 0;
for (size_t k = 0;
k < cleaned_merged_interesting_remainder->num_radix_blocks; k++) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],

View File

@@ -386,3 +386,69 @@ void reverseArray(uint64_t arr[], size_t n) {
end--;
}
}
uint64_t scratch_cuda_apply_noise_squashing_mem(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, int_noise_squashing_lut<uint64_t> **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t num_radix_blocks, uint32_t original_num_blocks,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_noise_squashing_lut<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
polynomial_size, num_radix_blocks, original_num_blocks,
allocate_gpu_memory, &size_tracker);
return size_tracker;
}
uint64_t scratch_cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_glwe_dimension,
uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t original_num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {
PUSH_RANGE("scratch noise squashing")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
uint64_t size_tracker = scratch_cuda_apply_noise_squashing_mem(
streams, gpu_indexes, gpu_count, params,
(int_noise_squashing_lut<uint64_t> **)mem_ptr, input_glwe_dimension,
input_polynomial_size, num_radix_blocks, original_num_blocks,
allocate_gpu_memory);
POP_RANGE()
return size_tracker;
}
void cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
PUSH_RANGE("apply noise squashing")
integer_radix_apply_noise_squashing_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
input_radix_lwe, (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
POP_RANGE()
}
void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup noise squashing")
int_noise_squashing_lut<uint64_t> *mem_ptr =
(int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}
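These three entry points follow the scratch/apply/cleanup lifecycle used throughout the CUDA backend; a sketch of the calling sequence (the elided arguments stand for the parameter list above, not real values):
int8_t *mem_ptr = nullptr;
// scratch: allocates device buffers and returns the tracked size in bytes
uint64_t size_tracker = scratch_cuda_apply_noise_squashing_kb(
streams, gpu_indexes, gpu_count, &mem_ptr, /* ...params... */);
// apply: can be called repeatedly while the buffer is alive
cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
output_radix_lwe, input_radix_lwe, mem_ptr,
ksks, ms_noise_reduction_key, bsks);
// cleanup: releases everything the scratch call allocated
cleanup_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
&mem_ptr);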

View File

@@ -9,6 +9,7 @@
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "linearalgebra/negation.cuh"
#include "pbs/pbs_128_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"
@@ -520,8 +521,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
if (num_radix_blocks > lut->num_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of lut radix blocks")
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of input & output radix blocks")
@@ -866,7 +866,7 @@ uint64_t generate_lookup_table_with_encoding(
memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
auto body = &acc[glwe_dimension * polynomial_size];
Torus degree = 0;
// This accumulator extracts the carry bits
for (int i = 0; i < input_modulus_sup; i++) {
@@ -886,7 +886,7 @@ uint64_t generate_lookup_table_with_encoding(
}
rotate_left<Torus>(body, half_box_size, polynomial_size);
return (uint64_t)degree;
}
template <typename Torus>
@@ -1291,7 +1291,7 @@ void host_compute_prefix_sum_hillis_steele(
}
// This function is used to perform step 2 of Thomas' new propagation algorithm
// Consists of three steps:
// - propagates the carry within each group with cheap LWE operations stored in
// simulators
// - calculates the propagation state of each group
@@ -1616,10 +1616,12 @@ __host__ void reduce_signs(
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto num_bits_in_message = log2_int(message_modulus);
std::function<Torus(Torus)> reduce_two_orderings_function =
[diff_buffer, sign_handler_f, num_bits_in_message,
message_modulus](Torus x) -> Torus {
Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
Torus lsb = x & (message_modulus - 1);
return diff_buffer->tree_buffer->block_selector_f(msb, lsb);
};
@@ -1640,7 +1642,7 @@ __host__ void reduce_signs(
while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
ms_noise_reduction_key, lut, num_sign_blocks / 2);
@@ -1669,7 +1671,8 @@ __host__ void reduce_signs(
message_modulus, carry_modulus, final_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes, 0);
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
ms_noise_reduction_key, lut, 1);
@@ -1677,8 +1680,8 @@ __host__ void reduce_signs(
} else {
std::function<Torus(Torus)> final_lut_f =
[mem_ptr, sign_handler_f, message_modulus](Torus x) -> Torus {
return sign_handler_f(x & (message_modulus - 1));
};
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
@@ -1831,9 +1834,6 @@ void host_propagate_single_carry(
PUSH_RANGE("propagate sc")
auto num_radix_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
auto lut_stride = mem->lut_stride;
auto num_many_lut = mem->num_many_lut;
CudaRadixCiphertextFFI output_flag;
@@ -1849,6 +1849,7 @@ void host_propagate_single_carry(
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
input_carries, 1);
}
// Step 1
host_compute_shifted_blocks_and_states<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, mem->shifted_blocks_state_mem,
@@ -2197,4 +2198,110 @@ void host_single_borrow_propagate(
}
}
/// num_radix_blocks corresponds to the number of blocks on which to apply the
/// LUT. In scalar bitops we use a number of blocks that may be lower or equal
/// to the input and output numbers of blocks
template <typename InputTorus>
__host__ void integer_radix_apply_noise_squashing_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,
InputTorus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
PUSH_RANGE("apply noise squashing")
auto params = lut->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
if (lwe_array_out->num_radix_blocks !=
(lwe_array_in->num_radix_blocks + 1) / 2)
PANIC("Cuda error: num output radix blocks should be "
"half ceil the number input radix blocks")
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
std::vector<InputTorus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<InputTorus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<__uint128_t *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<InputTorus *> lwe_trivial_indexes_vec =
lut->lwe_trivial_indexes_vec;
// We know carry is empty so we can pack two blocks in one
pack_blocks<InputTorus>(streams[0], gpu_indexes[0], lwe_array_pbs_in,
lwe_array_in, lwe_array_in->num_radix_blocks,
params.message_modulus);
// Since the radix ciphertexts are packed, we have to use the num_radix_blocks
// from the output ct
auto active_gpu_count =
get_active_gpu_count(lwe_array_out->num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<InputTorus>(
streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], (InputTorus *)lwe_array_pbs_in->ptr,
lut->lwe_indexes_in, ksks, lut->input_big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
lwe_array_out->num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_128_async<__uint128_t>(
streams, gpu_indexes, 1, (__uint128_t *)lwe_array_out->ptr,
lut->lut_vec, lwe_after_ks_vec[0], bsks, ms_noise_reduction_key,
lut->pbs_buffer, small_lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, lwe_array_out->num_radix_blocks);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
lut->input_big_lwe_dimension + 1);
execute_keyswitch_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec, lwe_trivial_indexes_vec,
ksks, lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log,
ks_level, lwe_array_out->num_radix_blocks);
execute_pbs_128_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, lut->lut_vec,
lwe_after_ks_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
small_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, lwe_array_out->num_radix_blocks);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count,
(__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
(__uint128_t *)lut->h_lwe_indexes_out, lut->using_trivial_lwe_indexes,
lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < lut->num_blocks; i++) {
lwe_array_out->degrees[i] = lut->degrees[0];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
POP_RANGE()
}
#endif // TFHE_RS_INTERNAL_INTEGER_CUH
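The half-ceil relation enforced at the top of this function follows from packing pairs of blocks; a minimal sketch:
// with empty carries, two radix blocks fit into one packed block,
// so n input blocks squash into ceil(n / 2) output blocks
uint32_t squashed_block_count(uint32_t num_input_blocks) {
return (num_input_blocks + 1) / 2;
}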

View File

@@ -226,72 +226,68 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec,
bool reduce_degrees_for_single_carry_propagation, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
// the general case
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
}
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "

View File

@@ -20,28 +20,11 @@
#include <fstream>
#include <iostream>
#include <omp.h>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
@@ -94,33 +77,155 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
}
}
__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
uint32_t *const columns_counter,
const uint64_t *const degrees,
const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec) {
const uint32_t idx = threadIdx.x;
size_t cnt = 0;
for (int i = 0; i < num_radix_in_vec; i++) {
size_t ct_id = i * num_radix_blocks + idx;
if (degrees[ct_id] != 0) {
columns[idx][cnt] = ct_id;
++cnt;
}
}
columns_counter[idx] = cnt;
}
template <typename Torus>
__global__ inline void prepare_new_columns_and_pbs_indexes(
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t *const *const columns,
const uint32_t *const columns_counter, const uint32_t chunk_size) {
__shared__ uint32_t counter;
if (threadIdx.x == 0) {
counter = 0;
}
__syncthreads();
const uint32_t base_id = threadIdx.x;
const uint32_t column_len = columns_counter[base_id];
uint32_t ct_count = 0;
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
// those indexes are for message ciphertexts
// for message ciphertexts in and out index should be same
const uint32_t in_index = columns[base_id][i];
new_columns[base_id][ct_count] = in_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = in_index;
lut_indexes[pbs_index] = 0;
++ct_count;
}
__syncthreads();
if (base_id > 0) {
const uint32_t prev_base_id = base_id - 1;
const uint32_t prev_column_len = columns_counter[prev_base_id];
for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
// those indexes are for carry ciphertexts
// for carry ciphertexts input is same as for message
// output will be placed to next block in the column
const uint32_t in_index = columns[prev_base_id][i];
const uint32_t out_index = columns[prev_base_id][i + 1];
new_columns[base_id][ct_count] = out_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = out_index;
lut_indexes[pbs_index] = 1;
++ct_count;
}
}
const uint32_t start_index = column_len - column_len % chunk_size;
for (uint32_t i = start_index; i < column_len; ++i) {
new_columns[base_id][ct_count] = columns[base_id][i];
++ct_count;
}
new_columns_counter[base_id] = ct_count;
}
template <typename Torus>
__global__ inline void prepare_final_pbs_indexes(
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
int idx = threadIdx.x;
pbs_indexes_in[idx] = idx % num_radix_blocks;
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
lut_indexes[idx] = idx / num_radix_blocks;
}
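// Worked example (assuming num_radix_blocks = 4 and 2 * num_radix_blocks
// threads):
//   idx:             0 1 2 3 | 4 5 6 7
//   pbs_indexes_in:  0 1 2 3 | 0 1 2 3   (idx % num_radix_blocks)
//   pbs_indexes_out: 0 1 2 3 | 5 6 7 8   (idx + idx / num_radix_blocks)
//   lut_indexes:     0 0 0 0 | 1 1 1 1   (0 = message LUT, 1 = carry LUT)
// Each keyswitched block feeds two PBSes: the message LUT writes block i
// in place while the carry LUT writes into block i + 1.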
template <typename Torus>
__global__ void calculate_chunks(Torus *const input_blocks,
const uint32_t *const *const columns,
const uint32_t *const columns_counter,
const uint32_t chunk_size,
const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
if (column_len >= chunk_size) {
const uint32_t num_chunks = column_len / chunk_size;
Torus result = 0;
for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
const uint32_t cur_ct_id =
columns[base_id][chunk_id * chunk_size + ct_id];
result += input_blocks[cur_ct_id * block_size + coef_id];
}
input_blocks[first_ct_id * block_size + coef_id] = result;
}
}
}
template <typename Torus>
__global__ void calculate_final_chunk_into_radix(
Torus *const out_radix, const Torus *const input_blocks,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size, const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
Torus result = 0;
if (column_len) {
const uint32_t first_ct_id = columns[base_id][0];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t i = 1; i < column_len; ++i) {
const uint32_t cur_ct_it = columns[base_id][i];
result += input_blocks[cur_ct_it * block_size + coef_id];
}
}
out_radix[base_id * block_size + coef_id] = result;
}
template <typename Torus, class params>
@@ -167,6 +272,113 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
(process_msb) ? cur_msb_ct[params::degree] : 0;
}
}
struct radix_columns {
std::vector<size_t> columns_counter;
size_t num_blocks;
size_t num_radix_in_vec;
size_t chunk_size;
radix_columns(const uint64_t *const input_degrees, size_t num_blocks,
size_t num_radix_in_vec, size_t chunk_size,
bool &needs_processing)
: num_blocks(num_blocks), num_radix_in_vec(num_radix_in_vec),
chunk_size(chunk_size) {
needs_processing = false;
columns_counter.resize(num_blocks, 0);
for (size_t i = 0; i < num_radix_in_vec; ++i) {
for (size_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns_counter[j] += 1;
}
}
for (size_t i = 0; i < num_blocks; ++i) {
if (columns_counter[i] > chunk_size) {
needs_processing = true;
break;
}
}
}
void next_accumulation(size_t &total_ciphertexts, size_t &message_ciphertexts,
bool &needs_processing) {
message_ciphertexts = 0;
total_ciphertexts = 0;
needs_processing = false;
for (int i = num_blocks - 1; i > 0; --i) {
size_t cur_count = columns_counter[i];
size_t prev_count = columns_counter[i - 1];
size_t new_count = 0;
// accumulated blocks from the current column
new_count += cur_count / chunk_size;
// all accumulated message blocks need a pbs
message_ciphertexts += new_count;
// carry blocks from the previous column
new_count += prev_count / chunk_size;
// both carry and message blocks that need a pbs
total_ciphertexts += new_count;
// now add the remaining non-accumulated blocks that do not require a pbs
new_count += cur_count % chunk_size;
columns_counter[i] = new_count;
if (new_count > chunk_size)
needs_processing = true;
}
// now do it for 0th block
size_t new_count = columns_counter[0] / chunk_size;
message_ciphertexts += new_count;
total_ciphertexts += new_count;
new_count += columns_counter[0] % chunk_size;
columns_counter[0] = new_count;
if (new_count > chunk_size) {
needs_processing = true;
}
}
};
inline void calculate_final_degrees(uint64_t *const out_degrees,
const uint64_t *const input_degrees,
size_t num_blocks, size_t num_radix_in_vec,
size_t chunk_size,
uint64_t message_modulus) {
auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
return std::min(message_modulus - 1, degree);
};
std::vector<std::queue<uint64_t>> columns(num_blocks);
for (size_t i = 0; i < num_radix_in_vec; ++i) {
for (size_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns[j].push(input_degrees[i * num_blocks + j]);
}
}
for (size_t i = 0; i < num_blocks; ++i) {
auto &col = columns[i];
while (col.size() > 1) {
uint32_t cur_degree = 0;
size_t mn = std::min(chunk_size, col.size());
for (int j = 0; j < mn; ++j) {
cur_degree += col.front();
col.pop();
}
const uint64_t new_degree = get_degree(cur_degree);
col.push(new_degree);
if ((i + 1) < num_blocks) {
columns[i + 1].push(new_degree);
}
}
}
for (int i = 0; i < num_blocks; i++) {
out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
}
}
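// Worked example (assuming 2_2 parameters): message_modulus = 4 and
// carry_modulus = 4, so chunk_size = (4 * 4 - 1) / (4 - 1) = 5: up to five
// blocks of degree 3 can be accumulated into one column entry before the
// sum (15) would overflow the message + carry space, at which point a PBS
// splits it back into a message block and a carry block.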
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -185,11 +397,14 @@ template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms,
bool reduce_degrees_for_single_carry_propagation, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
PANIC("Cuda error: output and input radix ciphertexts should have the same "
@@ -199,22 +414,29 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
PANIC("Cuda error: input vector does not have enough blocks")
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
PANIC("Cuda error: output does not have enough blocks")
if (num_radix_in_vec == 0)
return;
auto current_blocks = mem_ptr->current_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_degrees = mem_ptr->d_degrees;
auto d_columns = mem_ptr->d_columns;
auto d_columns_counter = mem_ptr->d_columns_counter;
auto d_new_columns = mem_ptr->d_new_columns;
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
auto luts_message_carry = mem_ptr->luts_message_carry;
auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;
auto chunk_size =
(mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
(mem_ptr->params.message_modulus - 1);
size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
// In the case of extracting a single LWE these parameters are dummy
uint32_t num_many_lut = 1;
@@ -228,244 +450,195 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms, 0, num_radix_blocks);
return;
}
if (num_radix_in_vec == 2) {
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks);
return;
}
if (mem_ptr->mem_reuse) {
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count);
}
if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
current_blocks, terms);
}
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
num_radix_in_vec);
bool needs_processing = false;
radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
num_radix_in_vec, chunk_size, needs_processing);
int number_of_threads = min(256, params::degree);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
check_cuda_error(cudaGetLastError());
while (needs_processing) {
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
d_columns_counter, chunk_size);
size_t total_ciphertexts;
size_t total_messages;
current_columns.next_accumulation(total_ciphertexts, total_messages,
needs_processing);
auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
total_ciphertexts * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
total_ciphertexts * sizeof(Torus));
for (uint32_t i = 0; i < total_ciphertexts; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < total_ciphertexts; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks,
current_blocks, bsks, ksks, ms_noise_reduction_key,
luts_message_carry, total_ciphertexts);
}
cuda_set_device(gpu_indexes[0]);
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
}
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);
if (reduce_degrees_for_single_carry_propagation) {
prepare_final_pbs_indexes<Torus>
<<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
cuda_memset_async(
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
auto active_gpu_count =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
num_blocks_in_apply_lut);
}
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
mem_ptr->params.message_modulus);
cuda_set_device(gpu_indexes[0]);
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice,
num_radix_blocks);
}
}
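The multi-GPU branch above stages the LUT indexes through page-locked (pinned) host buffers: cudaMemcpyAsync only runs truly asynchronously when the host side is pinned, and the stream must be synchronized before the host reads the copied data back. A minimal sketch of that round-trip, with illustrative names and sizes rather than the backend's helpers:

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t n = 8;
  uint64_t *d_indexes;
  cudaMalloc(&d_indexes, n * sizeof(uint64_t));
  cudaMemset(d_indexes, 0, n * sizeof(uint64_t));

  // Page-locked host memory lets the async copy overlap with the stream.
  uint64_t *h_pinned;
  cudaMallocHost((void **)&h_pinned, n * sizeof(uint64_t));

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMemcpyAsync(h_pinned, d_indexes, n * sizeof(uint64_t),
                  cudaMemcpyDeviceToHost, stream);
  // The data is only valid on the host after the stream synchronizes,
  // mirroring the cuda_synchronize_stream call in the branch above.
  cudaStreamSynchronize(stream);

  for (uint32_t i = 0; i < n; i++)
    printf("%llu ", (unsigned long long)h_pinned[i]);
  printf("\n");

  cudaFreeHost(h_pinned);
  cudaFree(d_indexes);
  cudaStreamDestroy(stream);
  return 0;
}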
template <typename Torus, class params>
@@ -600,9 +773,9 @@ __host__ void host_integer_mult_radix_kb(
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks, mem_ptr->luts_array);
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, true,
bsks, ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem,
num_blocks, 2 * num_blocks);
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;

View File

@@ -34,7 +34,7 @@ void update_degrees_after_scalar_bitor(uint64_t *output_degrees,
auto result = max;
for (uint j = 0; j < min + 1; j++) {
if (max | j > result) {
if ((max | j) > result) {
result = max | j;
}
}
@@ -52,7 +52,7 @@ void update_degrees_after_scalar_bitxor(uint64_t *output_degrees,
// Try every possibility to find the worst case
for (uint j = 0; j < min + 1; j++) {
if (max ^ j > result) {
if ((max ^ j) > result) {
result = max ^ j;
}
}
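Both fixes above address the same C++ precedence bug: '>' binds tighter than '|' and '^', so 'max | j > result' parses as 'max | (j > result)' and ORs a boolean into max instead of comparing the OR. A small self-contained check that the two readings genuinely disagree:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t max = 1, j = 0, result = 2;
  bool old_form = max | (j > result); // 1 | 0 == 1 -> true (spurious)
  bool new_form = (max | j) > result; // 1 > 2      -> false (intended)
  assert(old_form && !new_form);
  return 0;
}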

View File

@@ -6,7 +6,8 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -17,7 +18,28 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
num_scalar_bits, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_cuda_integer_radix_scalar_mul_high_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_scalar_mul_high<uint64_t> **)mem_ptr, num_blocks, params,
num_scalar_bits, anticipated_buffer_drop, allocate_gpu_memory);
}
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
@@ -83,6 +105,21 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
}
}
void cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
uint64_t rhs, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars) {
host_integer_radix_scalar_mul_high_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, ct,
(int_scalar_mul_high<uint64_t> *)mem_ptr, (uint64_t **)ksks, rhs,
decomposed_scalar, has_at_least_one_set, ms_noise_reduction_key, bsks,
num_scalars);
}
void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
@@ -93,3 +130,13 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_scalar_mul_high<uint64_t> *mem_ptr =
(int_scalar_mul_high<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
}

View File

@@ -6,6 +6,7 @@
#include <cuda_runtime.h>
#endif
#include "cast.cuh"
#include "device.h"
#include "integer/integer_utilities.h"
#include "multiplication.cuh"
@@ -32,12 +33,12 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, true, &size_tracker);
num_scalar_bits, allocate_gpu_memory, true, &size_tracker);
return size_tracker;
}
@@ -115,13 +116,10 @@ __host__ void host_integer_scalar_mul_radix(
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
} else {
for (int i = 0; i < j * num_radix_blocks; i++) {
all_shifted_buffer->degrees[i] = message_modulus - 1;
}
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j, nullptr);
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, true,
bsks, ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -170,4 +168,109 @@ __host__ void host_integer_small_scalar_mul_radix(
output_lwe_array->degrees[i] = input_lwe_array->degrees[i] * scalar;
}
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_high<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_high<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, LEFT_SHIFT, num_scalar_bits, anticipated_buffer_drop,
&size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high<Torus> *mem_ptr, Torus *const *ksks, uint64_t rhs,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars) {
if (rhs == (uint64_t)0) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
0, ct->num_radix_blocks);
return;
}
CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
gpu_indexes);
if (rhs != (uint64_t)1 || tmp_ffi->num_radix_blocks != 0) {
if ((rhs & (rhs - 1)) == 0) {
uint32_t shift = std::log2(rhs);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi, shift,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
} else {
switch (mem_ptr->params.polynomial_size) {
case 512:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 1024:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 2048:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 4096:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 8192:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 16384:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
default:
PANIC(
"Cuda error (scalar multiplication): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
}
}
}
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
}
#endif
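host_integer_radix_scalar_mul_high_kb above takes a fast path when rhs is a power of two: '(rhs & (rhs - 1)) == 0' is the standard test (rhs == 0 is handled earlier in the function), and the multiplication then degenerates to a logical left shift. std::log2 on a double is exact for 64-bit powers of two, but an integer count-trailing-zeros gives the same shift without floating point; a quick sketch of the equivalence:

#include <bit>
#include <cassert>
#include <cstdint>

static bool is_pow2(uint64_t x) { return x != 0 && (x & (x - 1)) == 0; }

int main() {
  assert(is_pow2(1) && std::countr_zero(uint64_t{1}) == 0);
  assert(is_pow2(1024) && std::countr_zero(uint64_t{1024}) == 10);
  assert(!is_pow2(12));
  // Multiplying by 2^k is a left shift by k.
  uint64_t ct = 5;
  assert(ct * 1024 == ct << std::countr_zero(uint64_t{1024}));
  return 0;
}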

View File

@@ -0,0 +1,46 @@
#include "subtraction.cuh"
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, allocate_ms_array);
return scratch_cuda_sub_and_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sub_and_propagate<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, allocate_gpu_memory);
}
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
host_sub_and_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
carry_out, carry_in, (int_sub_and_propagate<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
}
void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_sub_and_propagate<uint64_t> *mem_ptr =
(int_sub_and_propagate<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
}

View File

@@ -8,7 +8,46 @@
#include "device.h"
#include "integer/integer.h"
#include "linear_algebra.h"
#include "integer/integer_utilities.h"
#include "negation.cuh"
#include "pbs/pbs_enums.h"
template <typename Torus>
uint64_t scratch_cuda_sub_and_propagate_single_carry(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_sub_and_propagate<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
allocate_gpu_memory, &size_tracker);
return size_tracker;
}
template <typename Torus>
void host_sub_and_propagate_single_carry(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries,
int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, mem->neg_rhs_array, rhs_array,
mem->params.message_modulus, mem->params.carry_modulus,
mem->neg_rhs_array->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, lhs_array, mem->neg_rhs_array, carry_out,
input_carries, mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);
}
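host_sub_and_propagate_single_carry computes lhs - rhs by negating rhs blockwise and reusing the add-with-carry path. Arithmetically, over k radix blocks with message modulus B this is just a - b ≡ a + (B^k - b) (mod B^k); a plain-integer model of that identity:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t B = 4; // message modulus per block
  const uint32_t k = 4; // number of radix blocks
  uint64_t M = 1;       // M = B^k = 256
  for (uint32_t i = 0; i < k; i++)
    M *= B;
  uint64_t a = 200, b = 77;
  uint64_t neg_b = (M - b) % M;           // the "negate" step
  assert((a + neg_b) % M == (a - b) % M); // add-with-carry recovers a - b
  return 0;
}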
template <typename Torus>
__host__ void host_integer_radix_subtraction(

View File

@@ -261,6 +261,8 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
default:
break;
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(buffer, stream, gpu_index);
}

View File

@@ -279,6 +279,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
PANIC("Cuda error (convert KSK): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..16384].")
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_bsk, stream, gpu_index);
cuda_drop_async(buffer, stream, gpu_index);
@@ -315,6 +316,7 @@ void convert_u128_to_f128_and_forward_fft_128(cudaStream_t stream,
// convert u128 into 4 x double
batch_convert_u128_to_f128_strided_as_torus<params>
<<<grid_size, block_size, 0, stream>>>(d_bsk, d_standard);
check_cuda_error(cudaGetLastError());
// call negacyclic 128 bit forward fft.
if (full_sm) {
@@ -326,6 +328,7 @@ void convert_u128_to_f128_and_forward_fft_128(cudaStream_t stream,
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(buffer, stream, gpu_index);
}
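The check_cuda_error(cudaGetLastError()) lines added above close a silent-failure gap: kernel launches are asynchronous and return no status, so a bad launch configuration is only visible by polling the last error afterwards. A minimal illustration using the plain CUDA runtime rather than the project's check_cuda_error macro:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void noop() {}

int main() {
  noop<<<1, 0>>>(); // deliberately invalid: zero threads per block
  cudaError_t err = cudaGetLastError(); // the only place the failure shows up
  if (err != cudaSuccess)
    printf("launch failed: %s\n", cudaGetErrorString(err));
  return 0;
}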

View File

@@ -194,7 +194,8 @@ void execute_pbs_async(
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr)
if (ms_noise_reduction_key != nullptr &&
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams[i], gpu_indexes[i], current_lwe_array_out,

View File

@@ -0,0 +1,46 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#include "pbs/pbs_128_utilities.h"
static void
execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array,
uint64_t *size_tracker_on_gpu) {
// The squash noise function receives as input 64-bit integers
*size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);
}
template <typename Torus>
static void execute_pbs_128_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const LweArrayVariant<__uint128_t> &lwe_array_out,
const std::vector<Torus *> lut_vector,
const LweArrayVariant<uint64_t> &lwe_array_in,
void *const *bootstrapping_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
std::vector<int8_t *> pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
for (uint32_t i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
uint64_t *current_lwe_array_in = GET_VARIANT_ELEMENT_64BIT(lwe_array_in, i);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out, lut_vector[i],
current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
zeros, pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, num_inputs_on_gpu);
}
}
#endif
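execute_pbs_128_async splits num_samples across GPUs via get_num_inputs_on_gpu. Assuming that helper spreads samples as evenly as possible (the usual convention; its real definition lives elsewhere in the backend), a host-side model showing that every sample lands on exactly one GPU:

#include <cassert>
#include <cstdint>

// Assumed behaviour of get_num_inputs_on_gpu; illustrative only.
static uint32_t inputs_on_gpu(uint32_t num_samples, uint32_t i,
                              uint32_t gpu_count) {
  return num_samples / gpu_count + (i < num_samples % gpu_count ? 1 : 0);
}

int main() {
  uint32_t total = 0;
  for (uint32_t i = 0; i < 3; i++)
    total += inputs_on_gpu(10, i, 3); // 4 + 3 + 3
  assert(total == 10);
  return 0;
}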

View File

@@ -46,7 +46,7 @@ __global__ void device_programmable_bootstrap_cg(
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
uint32_t lut_stride) {
uint32_t lut_stride, bool uses_noise_reduction) {
grid_group grid = this_grid();
@@ -80,7 +80,9 @@ __global__ void device_programmable_bootstrap_cg(
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
uses_noise_reduction
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
@@ -263,7 +265,9 @@ __host__ void host_programmable_bootstrap_cg(
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
void *kernel_args[16];
bool uses_noise_reduction = buffer->uses_noise_reduction;
void *kernel_args[17];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lwe_output_indexes;
kernel_args[2] = &lut_vector;
@@ -279,6 +283,7 @@ __host__ void host_programmable_bootstrap_cg(
kernel_args[12] = &d_mem;
kernel_args[14] = &num_many_lut;
kernel_args[15] = &lut_stride;
kernel_args[16] = &uses_noise_reduction;
if (max_shared_memory < partial_sm) {
kernel_args[13] = &full_dm;
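The new uses_noise_reduction flag switches the kernels above between two input layouts: when the noise-reduction pass has already gathered the ciphertexts into a dense temporary buffer, blocks index it trivially by blockIdx.x; otherwise they go through lwe_input_indexes. A host-side model of that addressing choice, on illustrative data:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t lwe_size = 3; // lwe_dimension + 1
  std::vector<uint64_t> in = {10, 11, 12, 20, 21, 22, 30, 31, 32};
  std::vector<uint64_t> indexes = {2, 0, 1};
  bool uses_noise_reduction = false;
  uint32_t block = 0; // stands in for blockIdx.x
  const uint64_t *row = uses_noise_reduction
                            ? &in[block * lwe_size]           // dense input
                            : &in[indexes[block] * lwe_size]; // gathered
  assert(row[0] == 30); // indexes[0] == 2 selects the third ciphertext
  return 0;
}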

View File

@@ -660,22 +660,17 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
// If the parameters contain noise reduction key, then apply it
if (ms_noise_reduction_key != nullptr) {
if (ms_noise_reduction_key->num_zeros != 0) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_improve_noise_modulus_switch<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
buffer->temp_lwe_array_in,
static_cast<uint64_t const *>(lwe_array_in),
static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
num_samples, ms_noise_reduction_key->num_zeros,
ms_noise_reduction_key->ms_input_variance,
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
log_modulus);
} else {
buffer->temp_lwe_array_in =
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
}
if (buffer->uses_noise_reduction) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_improve_noise_modulus_switch<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index, buffer->temp_lwe_array_in,
static_cast<uint64_t const *>(lwe_array_in),
static_cast<uint64_t const *>(lwe_input_indexes),
static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
num_samples, ms_noise_reduction_key->num_zeros,
ms_noise_reduction_key->ms_input_variance,
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
log_modulus);
} else {
buffer->temp_lwe_array_in =
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
@@ -846,4 +841,7 @@ template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
bool allocate_ms_array);
template bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
__uint128_t>(uint32_t polynomial_size, uint32_t max_shared_memory);
#endif
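The 64-bit path now branches on buffer->uses_noise_reduction rather than re-testing num_zeros, and the else arm keeps its zero-copy trick: temp_lwe_array_in is simply aliased to the caller's input when no preprocessing runs. A reduced model of that pointer discipline, with illustrative names rather than the backend's types:

#include <cassert>
#include <cstdint>

struct Buffer {
  bool uses_noise_reduction;
  uint64_t *temp_lwe_array_in; // staging area for improved samples
};

const uint64_t *select_input(Buffer &b, const uint64_t *lwe_array_in) {
  if (b.uses_noise_reduction)
    return b.temp_lwe_array_in; // real code fills this via the MS-noise pass
  // No preprocessing: alias the input instead of copying it.
  b.temp_lwe_array_in = const_cast<uint64_t *>(lwe_array_in);
  return b.temp_lwe_array_in;
}

int main() {
  uint64_t staged[4] = {0}, input[4] = {1, 2, 3, 4};
  Buffer b{false, staged};
  assert(select_input(b, input) == input); // aliased, not copied
  return 0;
}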

View File

@@ -27,7 +27,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
double2 *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
uint64_t device_memory_size_per_block, bool uses_noise_reduction) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -55,7 +55,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
uses_noise_reduction
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
@@ -397,7 +399,8 @@ __host__ void execute_step_one(
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
bool uses_noise_reduction) {
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
@@ -410,20 +413,21 @@ __host__ void execute_step_one(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
full_dm);
full_dm, uses_noise_reduction);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, first_iter>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
partial_dm);
partial_dm, uses_noise_reduction);
} else {
device_programmable_bootstrap_step_one<Torus, params, FULLSM, first_iter>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0);
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
uses_noise_reduction);
}
check_cuda_error(cudaGetLastError());
}
@@ -504,6 +508,7 @@ __host__ void host_programmable_bootstrap(
Torus *global_accumulator = pbs_buffer->global_accumulator;
double2 *global_join_buffer = pbs_buffer->global_join_buffer;
int8_t *d_mem = pbs_buffer->d_mem;
bool uses_noise_reduction = pbs_buffer->uses_noise_reduction;
for (int i = 0; i < lwe_dimension; i++) {
if (i == 0) {
@@ -512,14 +517,16 @@ __host__ void host_programmable_bootstrap(
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
uses_noise_reduction);
} else {
execute_step_one<Torus, params, false>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
uses_noise_reduction);
}
if (i == lwe_dimension - 1) {
execute_step_two<Torus, params, true>(

View File

@@ -8,124 +8,67 @@ bool has_support_to_cuda_programmable_bootstrap_128_cg(
max_shared_memory);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128 bits inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
return scratch_cuda_programmable_bootstrap_128_vector<uint64_t>(
stream, gpu_index,
(pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> **)pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
}
uint64_t scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;
if (has_support_to_cuda_programmable_bootstrap_128_cg(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory)) {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 512:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 1024:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 2048:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 4096:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
default:
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
} else {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 512:
return scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 1024:
return scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 2048:
return scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 4096:
return scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
return scratch_cuda_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);
}
template <typename Torus>
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lut_vector, Torus *lwe_array_in,
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_128<AmortizedDegree<256>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 512:
host_programmable_bootstrap_128<AmortizedDegree<512>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 1024:
host_programmable_bootstrap_128<AmortizedDegree<1024>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 2048:
host_programmable_bootstrap_128<AmortizedDegree<2048>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 4096:
host_programmable_bootstrap_128<AmortizedDegree<4096>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
@@ -137,41 +80,42 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
}
}
template <typename Torus>
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lut_vector, Torus *lwe_array_in,
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 512:
host_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 1024:
host_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 2048:
host_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 4096:
host_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
@@ -183,6 +127,57 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
}
}
template <typename InputTorus>
void host_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
__uint128_t const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void const *ms_noise_reduction_ptr,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
// If the parameters contain noise reduction key, then apply it
if (ms_noise_reduction_key->num_zeros != 0) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_improve_noise_modulus_switch<InputTorus>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<InputTorus *>(buffer->temp_lwe_array_in),
static_cast<InputTorus const *>(lwe_array_in),
static_cast<uint64_t const *>(buffer->trivial_indexes),
static_cast<const InputTorus *>(ms_noise_reduction_ptr),
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
ms_noise_reduction_key->ms_input_variance,
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
log_modulus);
} else {
buffer->temp_lwe_array_in =
const_cast<InputTorus *>(static_cast<const InputTorus *>(lwe_array_in));
}
switch (buffer->pbs_variant) {
case DEFAULT:
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<InputTorus>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
case CG:
executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
InputTorus>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
/* Perform bootstrapping on a batch of input u128 LWE ciphertexts, storing the
* result in the same index for each ciphertext.
*
@@ -237,56 +232,22 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *streams, uint32_t gpu_index, void *lwe_array_out,
void const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
void const *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer =
(pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *)mem_ptr;
pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;
// If the parameters contain noise reduction key, then apply it
if (ms_noise_reduction_key->num_zeros != 0) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_improve_noise_modulus_switch<__uint128_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
static_cast<__uint128_t const *>(lwe_array_in),
static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
ms_noise_reduction_key->ms_input_variance,
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
log_modulus);
} else {
buffer->temp_lwe_array_in = const_cast<__uint128_t *>(
static_cast<const __uint128_t *>(lwe_array_in));
}
switch (buffer->pbs_variant) {
case DEFAULT:
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const __uint128_t *>(lut_vector),
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
case CG:
executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
__uint128_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const __uint128_t *>(lut_vector),
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
host_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
streams, gpu_index, lwe_array_out,
static_cast<const __uint128_t *>(lut_vector), lwe_array_in,
bootstrapping_key, ms_noise_reduction_key, ms_noise_reduction_ptr, buffer,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
num_samples);
}
/*
@@ -295,6 +256,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
*/
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **buffer) {
auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
auto x = (pbs_buffer_128<__uint128_t, PBS_TYPE::CLASSICAL> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
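The 128-bit PBS entry points above follow the usual scratch/use/cleanup lifecycle. A hedged usage sketch using only the signatures visible in this diff; it assumes the backend headers' C linkage, an available CUDA device, and illustrative (not recommended) parameter values:

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

extern "C" {
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array);
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
                                             int8_t **buffer);
}

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  int8_t *buffer = nullptr;
  // The scratch call reports how many bytes it placed on the GPU.
  uint64_t bytes = scratch_cuda_programmable_bootstrap_128_vector_64(
      stream, /*gpu_index=*/0, &buffer, /*lwe_dimension=*/742,
      /*glwe_dimension=*/1, /*polynomial_size=*/2048, /*level_count=*/1,
      /*input_lwe_ciphertext_count=*/4, /*allocate_gpu_memory=*/true,
      /*allocate_ms_array=*/true);
  printf("PBS128 buffer: %llu bytes on GPU\n", (unsigned long long)bytes);
  // ... cuda_programmable_bootstrap_lwe_ciphertext_vector_128(...) ...
  cleanup_cuda_programmable_bootstrap_128(stream, /*gpu_index=*/0, &buffer);
  cudaStreamDestroy(stream);
  return 0;
}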

View File

@@ -74,16 +74,17 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_128(
__syncthreads();
}
template <typename Torus, class params, sharedMemDegree SMD, bool first_iter>
template <typename InputTorus, class params, sharedMemDegree SMD,
bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_one_128(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lwe_array_in,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
const __uint128_t *__restrict__ lut_vector,
const InputTorus *__restrict__ lwe_array_in,
const double *__restrict__ bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t lwe_iteration, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -100,22 +101,22 @@ __global__ void __launch_bounds__(params::degree / params::opt)
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
__uint128_t *accumulator = (__uint128_t *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
(ptrdiff_t)(sizeof(__uint128_t) * polynomial_size / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
const Torus *block_lut_vector = lut_vector;
const __uint128_t *block_lut_vector = lut_vector;
Torus *global_slice =
__uint128_t *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
@@ -127,12 +128,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
if constexpr (first_iter) {
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
InputTorus b_hat = 0;
modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
@@ -146,20 +147,21 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
InputTorus a_hat = 0;
modulus_switch<InputTorus>(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree +
1); // 2 * params::log2_degree + 1);
__syncthreads();
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
__uint128_t, params::opt, params::degree / params::opt>(
global_slice, accumulator, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
init_decomposer_state_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
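modulus_switch above maps a full-width torus coefficient into [0, 2N) using log2_degree + 1 bits. One common way to realize that is to keep the top log_modulus bits with rounding; the backend's device function may differ in details, so the following only illustrates the arithmetic:

#include <cassert>
#include <cstdint>

// Hedged model: round a 64-bit torus value to its top `log_modulus` bits,
// reducing the result mod 2^log_modulus (= 2N when log_modulus = log2(N)+1).
static uint64_t modulus_switch_model(uint64_t input, uint32_t log_modulus) {
  uint32_t shift = 64 - log_modulus;
  uint64_t mask = (uint64_t{1} << log_modulus) - 1;
  return (((input >> (shift - 1)) + 1) >> 1) & mask; // round to nearest
}

int main() {
  const uint32_t log2N = 10;              // N = 1024
  const uint32_t log_modulus = log2N + 1; // target modulus 2N = 2048
  // Half way around the torus lands on N.
  assert(modulus_switch_model(uint64_t{1} << 63, log_modulus) == 1024);
  assert(modulus_switch_model(0, log_modulus) == 0);
  return 0;
}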
@@ -168,7 +170,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
@@ -314,10 +317,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
*
* Each y-block computes one element of the lwe_array_out.
*/
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, class params, sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_cg_128(
Torus *lwe_array_out, const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lwe_array_in,
__uint128_t *lwe_array_out, const __uint128_t *__restrict__ lut_vector,
const InputTorus *__restrict__ lwe_array_in,
const double *__restrict__ bootstrapping_key, double *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
@@ -342,23 +345,22 @@ __global__ void device_programmable_bootstrap_cg_128(
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator + (ptrdiff_t)(polynomial_size);
__uint128_t *accumulator = (__uint128_t *)selected_memory;
__uint128_t *accumulator_rotated =
(__uint128_t *)accumulator + (ptrdiff_t)(polynomial_size);
double *accumulator_fft =
(double *)(accumulator_rotated) +
(ptrdiff_t)(polynomial_size * sizeof(Torus) / sizeof(double));
(ptrdiff_t)(polynomial_size * sizeof(__uint128_t) / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[blockIdx.x * params::degree * (glwe_dimension + 1)];
const __uint128_t *block_lut_vector = lut_vector;
double *block_join_buffer =
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
@@ -368,11 +370,11 @@ __global__ void device_programmable_bootstrap_cg_128(
// rotated array is not in use anymore by the time we perform the fft
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
InputTorus b_hat = 0;
modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
@@ -381,17 +383,18 @@ __global__ void device_programmable_bootstrap_cg_128(
__syncthreads();
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
InputTorus a_hat = 0;
modulus_switch<InputTorus>(block_lwe_array_in[i], a_hat,
params::log2_degree + 1);
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
__uint128_t, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
init_decomposer_state_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
@@ -400,8 +403,8 @@ __global__ void device_programmable_bootstrap_cg_128(
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
@@ -420,8 +423,9 @@ __global__ void device_programmable_bootstrap_cg_128(
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
__syncthreads();
add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
acc_fft_im_lo, accumulator);
add_to_torus_128<__uint128_t, params>(acc_fft_re_hi, acc_fft_re_lo,
acc_fft_im_hi, acc_fft_im_lo,
accumulator);
}
auto block_lwe_array_out =
@@ -433,17 +437,20 @@ __global__ void device_programmable_bootstrap_cg_128(
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<__uint128_t, params>(block_lwe_array_out,
accumulator);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
0);
}
}
}
template <typename params>
template <typename InputTorus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_cg_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
@@ -457,33 +464,34 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<CLASSICAL>(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}
template <typename params>
template <typename InputTorus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
@@ -504,37 +512,37 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
@@ -580,17 +588,122 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
}
uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<CLASSICAL>(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}
template <class params, bool first_iter>
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128 bits inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
template <typename InputTorus>
uint64_t scratch_cuda_programmable_bootstrap_128_vector(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto buffer = (pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **)pbs_buffer;
if (has_support_to_cuda_programmable_bootstrap_128_cg(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory)) {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 512:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 1024:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 2048:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 4096:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
default:
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
} else {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 512:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 1024:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 2048:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 4096:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
}
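As its comment says, this scratch entry point sizes and allocates every PBS-128 temporary and reports the GPU footprint through size_tracker. A hedged caller sketch; the parameter values are illustrative and the cleanup call is a placeholder for the backend's matching release function, which this diff does not show:

pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer = nullptr;
uint64_t gpu_bytes = scratch_cuda_programmable_bootstrap_128_vector<uint64_t>(
    stream, gpu_index, &buffer,
    /*lwe_dimension=*/879, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
    /*level_count=*/1, /*input_lwe_ciphertext_count=*/4,
    /*allocate_gpu_memory=*/true, /*allocate_ms_array=*/true);
// ... run bootstraps that reuse `buffer` ...
release_pbs_buffer_128(stream, gpu_index, buffer); // hypothetical cleanup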
template <typename InputTorus, class params, bool first_iter>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
- __uint128_t *lwe_array_in, double const *bootstrapping_key,
+ InputTorus *lwe_array_in, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -603,21 +716,21 @@ __host__ void execute_step_one_128(
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
if (max_shared_memory < partial_sm) {
- device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM,
+ device_programmable_bootstrap_step_one_128<InputTorus, params, NOSM,
first_iter>
<<<grid, thds, 0, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
base_log, level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
- device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM,
+ device_programmable_bootstrap_step_one_128<InputTorus, params, PARTIALSM,
first_iter>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
base_log, level_count, d_mem, partial_dm);
} else {
- device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
+ device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
first_iter>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
@@ -670,11 +783,12 @@ __host__ void execute_step_two_128(
/*
* Host wrapper to the programmable bootstrap 128
*/
- template <class params>
+ template <typename InputTorus, class params>
__host__ void host_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
- __uint128_t const *lut_vector, __uint128_t *lwe_array_in,
- double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *pbs_buffer,
+ __uint128_t const *lut_vector, InputTorus *lwe_array_in,
+ double const *bootstrapping_key,
+ pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
@@ -704,14 +818,14 @@ __host__ void host_programmable_bootstrap_128(
for (int i = 0; i < lwe_dimension; i++) {
if (i == 0) {
- execute_step_one_128<params, true>(
+ execute_step_one_128<InputTorus, params, true>(
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
d_mem, i, partial_sm, partial_dm_step_one, full_sm_step_one,
full_dm_step_one);
} else {
- execute_step_one_128<params, false>(
+ execute_step_one_128<InputTorus, params, false>(
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
@@ -736,11 +850,12 @@ __host__ void host_programmable_bootstrap_128(
}
}
- template <class params>
+ template <typename InputTorus, class params>
__host__ void host_programmable_bootstrap_cg_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
- __uint128_t const *lut_vector, __uint128_t const *lwe_array_in,
- double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
+ __uint128_t const *lut_vector, InputTorus const *lwe_array_in,
+ double const *bootstrapping_key,
+ pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
@@ -783,20 +898,20 @@ __host__ void host_programmable_bootstrap_cg_128(
if (max_shared_memory < partial_sm) {
kernel_args[10] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
- (void *)device_programmable_bootstrap_cg_128<__uint128_t, params, NOSM>,
+ (void *)device_programmable_bootstrap_cg_128<InputTorus, params, NOSM>,
grid, thds, (void **)kernel_args, 0, stream));
} else if (max_shared_memory < full_sm) {
kernel_args[10] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
- (void *)device_programmable_bootstrap_cg_128<__uint128_t, params,
-                                              PARTIALSM>,
+ (void *)
+     device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, stream));
} else {
int no_dm = 0;
kernel_args[10] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)
- device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
+ device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
grid, thds, (void **)kernel_args, full_sm, stream));
}
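Both host wrappers above use the backend's three-tier shared-memory dispatch: compare the device's dynamic shared-memory budget against each variant's needs and launch NOSM, PARTIALSM or FULLSM accordingly. A self-contained sketch of the idiom with a toy kernel (all names and sizes are illustrative, not the repo's code; assumes full_sm and partial_sm are at least blockDim.x * sizeof(double)):

#include <cuda_runtime.h>

enum SharedMemoryMode { SM_NOSM, SM_PARTIALSM, SM_FULLSM };

// The mode decides whether the working set lives in dynamic shared memory or
// in a preallocated global scratch buffer (the role d_mem plays above).
template <SharedMemoryMode mode>
__global__ void demo_kernel(double *out, double *global_scratch) {
  extern __shared__ double smem[];
  double *work = (mode == SM_NOSM)
                     ? global_scratch + blockIdx.x * blockDim.x
                     : smem;
  work[threadIdx.x] = static_cast<double>(threadIdx.x);
  __syncthreads();
  out[blockIdx.x * blockDim.x + threadIdx.x] = work[threadIdx.x];
}

void launch_best_variant(double *out, double *d_mem, int full_sm,
                         int partial_sm, int gpu_index) {
  int max_sm = 0;
  cudaDeviceGetAttribute(&max_sm, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         gpu_index);
  if (max_sm < partial_sm) {
    demo_kernel<SM_NOSM><<<1, 256, 0>>>(out, d_mem);
  } else if (max_sm < full_sm) {
    demo_kernel<SM_PARTIALSM><<<1, 256, partial_sm>>>(out, d_mem);
  } else {
    // Opt in to the larger dynamic allocation first, as the scratch code does.
    cudaFuncSetAttribute(demo_kernel<SM_FULLSM>,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm);
    demo_kernel<SM_FULLSM><<<1, 256, full_sm>>>(out, d_mem);
  }
}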


@@ -398,20 +398,32 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+ bool supports_cg =
+     supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
+         glwe_dimension, polynomial_size, level_count,
+         input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
#if (CUDA_ARCH >= 900)
- if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
+ // On H100s we should be using TBC until num_samples < num_sms / 2.
+ // After that we switch to CG until not supported anymore.
+ // At this point we return to TBC.
+ int num_sms = 0;
+ check_cuda_error(cudaDeviceGetAttribute(
+     &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
+ bool supports_tbc =
+     has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
- level_count, cuda_get_max_shared_memory(gpu_index)))
+ level_count, cuda_get_max_shared_memory(gpu_index));
+ if (supports_tbc &&
+     !(input_lwe_ciphertext_count > num_sms / 2 && supports_cg))
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
#endif
- if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
-     uint64_t>(glwe_dimension, polynomial_size, level_count,
-               input_lwe_ciphertext_count,
-               cuda_get_max_shared_memory(gpu_index)))
+ if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,

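The new dispatch boils down to a small predicate: prefer TBC for small batches, hand over to CG once the batch exceeds half the SM count, and otherwise fall back. A standalone restatement of that selection logic (the enum and function name are illustrative; supports_tbc and supports_cg stand in for the capability checks above):

#include <cstdint>

enum class MultiBitVariant { Tbc, Cg, Default };

MultiBitVariant choose_multi_bit_variant(bool supports_tbc, bool supports_cg,
                                         uint32_t num_samples, int num_sms) {
  // TBC wins while the batch is small relative to the SM count,
  // or whenever CG is not an option.
  if (supports_tbc &&
      !(num_samples > static_cast<uint32_t>(num_sms / 2) && supports_cg))
    return MultiBitVariant::Tbc;
  if (supports_cg)
    return MultiBitVariant::Cg;
  return MultiBitVariant::Default;
}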
Some files were not shown because too many files have changed in this diff.