chore(hlapi): Add array conversion from/to Vec<FheType>

Add a `From` impl to allow conversion from a Vec<FheType>, such as Vec<FheUint32>, to a Cpu/Gpu array.
refactor(gpu): creating CudaScalarDivisorFFI for storing decomposed scalars and their metadata
2026-01-11 15:48:20 -05:00 · 2025-07-16 16:54:16 +02:00 · 2025-07-16 07:59:20 +01:00 · 2025-07-15 17:35:47 +02:00 · 2025-07-15 16:03:45 +02:00 · 2025-07-15 12:46:38 +02:00
1047 changed files with 101973 additions and 12370 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+*.hpu filter=lfs diff=lfs merge=lfs -text
+*.bcode filter=lfs diff=lfs merge=lfs -text
+*.cbor filter=lfs diff=lfs merge=lfs -text
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -6,6 +6,7 @@ self-hosted-runner:
    - large_windows_16_latest
    - large_ubuntu_16
    - large_ubuntu_16-22.04
+    - v80-desktop
 # Configuration variables in array of strings defined in your repository or
 # organization. `null` means disabling configuration variables check.
 # Empty array means no configuration variable is allowed.
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -33,7 +33,9 @@ runs:
      if: inputs.github-instance == 'true'
      shell: bash
      run: |
-        TOOLKIT_VERSION="$(echo ${CUDA_VERSION} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
+        # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
+        # shellcheck disable=SC2001
+        TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${env.CUDA_KEYRING_PACKAGE}
        echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
        sha256sum -c checksum
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -67,49 +67,30 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

-      - name: Use specific data branch
-        if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
-        env:
-          PR_BRANCH: ${{ github.head_ref || github.ref_name }}
+      # Cache key is an aggregated hash of lfs files hashes
+      - name: Get LFS data sha
+        id: hash-lfs-data
        run: |
-          echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
-
-      - name: Get backward compat branch
-        id: backward_compat_branch
-        run: |
-          BRANCH="$(make backward_compat_branch)"
-          echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
-
-      - name: Get backward compat branch head SHA
-        id: backward_compat_sha
-        run: |
-          SHA=$(git ls-remote "${REPO_URL}" refs/heads/"${BACKWARD_COMPAT_BRANCH}" | awk '{print $1}')
+          SHA=$(git lfs ls-files -l -I utils/tfhe-backward-compat-data | sha256sum | cut -d' ' -f1)
          echo "sha=${SHA}" >> "${GITHUB_OUTPUT}"
-        env:
-          REPO_URL: "https://github.com/zama-ai/tfhe-backward-compat-data"
-          BACKWARD_COMPAT_BRANCH: ${{ steps.backward_compat_branch.outputs.branch }}

      - name: Retrieve data from cache
        id: retrieve-data-cache
        uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
        with:
-          path: tests/tfhe-backward-compat-data
-          key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}
+          path: |
+            utils/tfhe-backward-compat-data/**/*.cbor
+            utils/tfhe-backward-compat-data/**/*.bcode
+          key: ${{ steps.hash-lfs-data.outputs.sha }}

-      - name: Clone test data
+      - name: Pull test data
        if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
-          repository: zama-ai/tfhe-backward-compat-data
-          path: tests/tfhe-backward-compat-data
-          lfs: 'true'
-          ref: ${{ steps.backward_compat_branch.outputs.branch }}
+        run: |
+          make pull_backward_compat_data

      - name: Run backward compatibility tests
        run: |
@@ -120,15 +101,18 @@ jobs:
        continue-on-error: true
        uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
        with:
-          path: tests/tfhe-backward-compat-data
-          key: ${{ steps.backward_compat_branch.outputs.branch }}_${{ steps.backward_compat_sha.outputs.sha }}
+          path: |
+            utils/tfhe-backward-compat-data/**/*.cbor
+            utils/tfhe-backward-compat-data/**/*.bcode
+          key: ${{ steps.hash-lfs-data.outputs.sha }}

      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -174,7 +174,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -182,9 +182,11 @@ jobs:
        if: needs.should-run.outputs.csprng_test == 'true'
        run: |
          make test_tfhe_csprng
+          make test_tfhe_csprng_big_endian

      - name: Run tfhe-zk-pok tests
-        if: needs.should-run.outputs.zk_pok_test == 'true'
+        # Always run it to catch non deterministic bugs earlier
+        # if: needs.should-run.outputs.zk_pok_test == 'true'
        run: |
          make test_zk_pok

@@ -272,9 +274,10 @@ jobs:
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Slack Notification
        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -103,7 +103,7 @@ jobs:
    name: Unsigned integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow_ref }}
+      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -114,7 +114,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -142,9 +142,10 @@ jobs:
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -104,7 +104,7 @@ jobs:
    name: Signed integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow_ref }}
+      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -115,7 +115,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -147,9 +147,10 @@ jobs:
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -185,7 +185,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -254,9 +254,10 @@ jobs:
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -68,7 +68,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -123,9 +123,10 @@ jobs:
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -58,14 +58,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -114,8 +117,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -58,14 +58,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -107,8 +110,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_dex.yml
+++ b/.github/workflows/benchmark_dex.yml
@@ -58,14 +58,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -95,15 +98,27 @@ jobs:
        env:
          REF_NAME: ${{ github.ref_name }}

-      - name: Parse swap request PBS counts
+      - name: Parse swap request update PBS counts
        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_pbs_count.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_update_dex_balance_pbs_count.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

-      - name: Parse swap claim PBS counts
+      - name: Parse swap request finalize PBS counts
        run: |
-          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_pbs_count.csv "${RESULTS_FILENAME}" \
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_finalize_pbs_count.csv "${RESULTS_FILENAME}" \
+          --object-sizes \
+          --append-results
+
+      - name: Parse swap claim prepare PBS counts
+        run: |
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_prepare_pbs_count.csv "${RESULTS_FILENAME}" \
+          --object-sizes \
+          --append-results
+
+      - name: Parse swap claim update PBS counts
+        run: |
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_update_dex_balance_pbs_count.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

@@ -116,8 +131,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -59,14 +59,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -111,8 +114,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -31,6 +31,7 @@ on:
          - ks
          - ks_pbs
          - integer_zk
+          - hlapi_noise_squash
      op_flavor:
        description: "Operations set to run"
        type: choice
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -46,15 +46,18 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+            echo "FAST_BENCH=TRUE";
          } >> "${GITHUB_ENV}"
-          echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -93,8 +96,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -124,14 +130,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -159,7 +168,8 @@ jobs:
          --commit-date "${COMMIT_DATE}" \
          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
-      
+        env:
+          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -170,8 +180,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -84,7 +84,7 @@ jobs:
        run: |
          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
          # shellcheck disable=SC2001
-          PARSED_COMMAND=$(echo "${INPUTS_COMMAND}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
+          PARSED_COMMAND=$(echo "${INPUTS_COMMAND}" | sed 's/[[:space:]]*,[[:space:]]*/\", \"/g')
          echo "COMMAND=[\"${PARSED_COMMAND}\"]" >> "${GITHUB_ENV}"

      - name: Set single operations flavor
@@ -120,25 +120,24 @@ jobs:
        env:
          INPUTS_PARAMS_TYPE: ${{ inputs.params_type }}

-
      - name: Set command output
        id: set_command
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "command=${{ toJSON(env.COMMAND) }}" >> "${GITHUB_OUTPUT}"

      - name: Set operation flavor output
        id: set_op_flavor
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

      - name: Set benchmark types output
        id: set_bench_type
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"

      - name: Set parameters types output
        id: set_params_type
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "params_type=${{ toJSON(env.PARAMS_TYPE) }}" >> "${GITHUB_OUTPUT}"

  setup-instance:
@@ -227,6 +226,8 @@ jobs:
        include:
          - cuda: "12.2"
            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -237,18 +238,20 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      # Re-export environment variables as dependencies setup perform this task in the previous job.
      # Local env variables are cleaned at the end of each job.
      - name: Export CUDA variables
        shell: bash
        run: |
-          CUDA_PATH=/usr/local/cuda-${{ matrix.cuda }}
          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "PATH=$PATH:$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
@@ -258,13 +261,15 @@ jobs:
        shell: bash
        run: |
          {
-          echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-          echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          echo "CC=/usr/bin/gcc-${GCC_VERSION}";
+          echo "CXX=/usr/bin/g++-${GCC_VERSION}";
+          echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
          } >> "${GITHUB_ENV}"
+        env:
+          GCC_VERSION: ${{ matrix.gcc }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -275,11 +280,11 @@ jobs:

      - name: Run benchmarks
        run: |
-          make BENCH_OP_FLAVOR="${OP_FLAVOR}" BENCH_TYPE="${BENCH_TYPE}" BENCH_PARAM_TYPE="${PARAMS_TYPE}" bench_"${COMMAND}"_gpu
+          make BENCH_OP_FLAVOR="${OP_FLAVOR}" BENCH_TYPE="${BENCH_TYPE}" BENCH_PARAM_TYPE="${BENCH_PARAMS_TYPE}" bench_"${BENCH_COMMAND}"_gpu
        env:
          OP_FLAVOR: ${{ matrix.op_flavor }}
          BENCH_TYPE: ${{ matrix.bench_type }}
-          BENCH_PARAM_TYPE: ${{ matrix.params_type }}
+          BENCH_PARAMS_TYPE: ${{ matrix.params_type }}
          BENCH_COMMAND: ${{ matrix.command }}

      - name: Parse results
@@ -317,8 +322,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

  slack-notify:
    name: Slack Notification
--- a/.github/workflows/benchmark_gpu_dex_common.yml
+++ b/.github/workflows/benchmark_gpu_dex_common.yml
@@ -119,14 +119,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -167,8 +170,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

  slack-notify:
    name: Slack Notification
--- a/.github/workflows/benchmark_gpu_erc20_common.yml
+++ b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -120,14 +120,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -168,8 +171,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

  slack-notify:
    name: Slack Notification
--- a/.github/workflows/benchmark_gpu_weekly.yml
+++ b/.github/workflows/benchmark_gpu_weekly.yml
@@ -10,37 +10,16 @@ on:
 permissions: {}

 jobs:
-  run-benchmarks-1-h100:
-    name: Run integer benchmarks (1xH100)
+  run-benchmarks-8-h100-sxm5-integer:
+    name: Run integer benchmarks (8xH100-SXM5)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
-      profile: single-h100
-      hardware_name: n3-H100x1
-      command: integer,integer_multi_bit
-      op_flavor: default
-      bench_type: latency
-      all_precisions: true
-    secrets:
-      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
-      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-      JOB_SECRET: ${{ secrets.JOB_SECRET }}
-      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
-      SLAB_URL: ${{ secrets.SLAB_URL }}
-      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
-
-  run-benchmarks-2-h100:
-    name: Run integer benchmarks (2xH100)
-    if: github.repository == 'zama-ai/tfhe-rs'
-    uses: ./.github/workflows/benchmark_gpu_common.yml
-    with:
-      profile: 2-h100
-      hardware_name: n3-H100x2
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100x8-SXM5
      command: integer_multi_bit
      op_flavor: default
-      bench_type: latency
+      bench_type: both
      all_precisions: true
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -52,16 +31,16 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-8-h100:
-    name: Run integer benchmarks (8xH100)
+  run-benchmarks-8-h100-sxm5-integer-compression:
+    name: Run integer compression benchmarks (8xH100-SXM5)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
-      profile: multi-h100
-      hardware_name: n3-H100x8
-      command: integer_multi_bit
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100x8-SXM5
+      command: integer_compression
      op_flavor: default
-      bench_type: latency
+      bench_type: both
      all_precisions: true
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -73,16 +52,37 @@ jobs:
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

-  run-benchmarks-l40:
-    name: Run integer benchmarks (L40)
+  run-benchmarks-8-h100-sxm5-integer-zk:
+    name: Run integer zk benchmarks (8xH100-SXM5)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
-      profile: l40
-      hardware_name: n3-L40x1
-      command: integer_multi_bit,integer_compression,pbs,ks
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100x8-SXM5
+      command: integer_zk
      op_flavor: default
-      bench_type: latency
+      bench_type: both
+      all_precisions: true
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-8-h100-sxm5-noise-squash:
+    name: Run noise squash benchmarks (8xH100-SXM5)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: multi-h100-sxm5
+      hardware_name: n3-H100x8-SXM5
+      command: hlapi_noise_squash
+      op_flavor: default
+      bench_type: both
      all_precisions: true
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
--- a/.github/workflows/benchmark_hpu_integer.yml
+++ b/.github/workflows/benchmark_hpu_integer.yml
@@ -0,0 +1,96 @@
+# Run all integer and ERC20 benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
+name: Hpu Integer Benchmarks
+
+on:
+  workflow_dispatch:
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+
+permissions: {}
+
+jobs:
+  integer-benchmarks-hpu:
+    name: Execute integer & erc20 benchmarks for HPU backend
+    runs-on: v80-desktop
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    timeout-minutes: 1440  # 24 hours
+    steps:
+      # Needed as long as hw_regmap repository is private
+      - name: Configure SSH
+        uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
+        with:
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          lfs: true
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: nightly
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Run benchmarks
+        run: |
+          make pull_hpu_files
+          make bench_integer_hpu
+          make bench_hlapi_erc20_hpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
+          --database tfhe_rs \
+          --hardware "hpu_x1" \
+          --backend hpu \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
+          --walk-subdirs
+        env:
+          REF_NAME: ${{ github.ref_name }}
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
+        with:
+          name: ${{ github.sha }}_integer_benchmarks
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}
--- a/.github/workflows/benchmark_integer.yml
+++ b/.github/workflows/benchmark_integer.yml
@@ -78,12 +78,12 @@ jobs:

      - name: Set operation flavor output
        id: set_op_flavor
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

      - name: Set benchmark types output
        id: set_bench_type
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"

  setup-instance:
@@ -128,14 +128,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -193,8 +196,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Set operation flavor output
        id: set_op_flavor
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

  setup-instance:
@@ -89,14 +89,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -150,8 +153,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_signed_integer.yml
+++ b/.github/workflows/benchmark_signed_integer.yml
@@ -78,12 +78,12 @@ jobs:

      - name: Set operation flavor output
        id: set_op_flavor
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"

      - name: Set benchmark types output
        id: set_bench_type
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"

  setup-instance:
@@ -128,14 +128,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -185,8 +188,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -48,7 +48,7 @@ jobs:
    name: Execute FFT benchmarks in EC2
    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow_ref }}
+      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
    steps:
@@ -61,11 +61,14 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -107,8 +110,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -48,7 +48,7 @@ jobs:
    name: Execute NTT benchmarks in EC2
    needs: setup-ec2
    concurrency:
-      group: ${{ github.workflow_ref }}
+      group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
    steps:
@@ -61,11 +61,14 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -107,8 +110,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_tfhe_zk_pok.yml
+++ b/.github/workflows/benchmark_tfhe_zk_pok.yml
@@ -98,14 +98,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -141,7 +144,7 @@ jobs:
      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
        with:
-          name: ${{ github.sha }}_tfhe_zk_pok
+          name: ${{ github.sha }}_tfhe_zk_pok_${{ env.BENCH_TYPE }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
@@ -155,8 +158,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -96,14 +96,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -136,12 +139,16 @@ jobs:

      - name: Install web resources
        run: |
-          make install_${{ matrix.browser }}_browser
-          make install_${{ matrix.browser }}_web_driver
+          make install_"${BROWSER}"_browser
+          make install_"${BROWSER}"_web_driver
+        env:
+          BROWSER: ${{ matrix.browser }}

      - name: Run benchmarks
        run: |
-          make bench_web_js_api_parallel_${{ matrix.browser }}_ci
+          make bench_web_js_api_parallel_"${BROWSER}"_ci
+        env:
+          BROWSER: ${{ matrix.browser }}

      - name: Parse results
        run: |
@@ -188,8 +195,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -92,7 +92,7 @@ jobs:

      - name: Set benchmark types output
        id: set_bench_type
-        run: |
+        run: | # zizmor: ignore[template-injection] this env variable is safe
          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"

  setup-instance:
@@ -140,14 +140,17 @@ jobs:

      - name: Get benchmark details
        run: |
+          COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
-            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_DATE=${COMMIT_DATE}";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"
+        env:
+          SHA: ${{ github.sha }}

      - name: Install rust
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: nightly

@@ -191,7 +194,7 @@ jobs:
      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
        with:
-          name: ${{ github.sha }}_integer_zk
+          name: ${{ github.sha }}_integer_zk_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
@@ -205,8 +208,11 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
-          --slab-url "${{ secrets.SLAB_URL }}"
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
+          --slab-url "${SLAB_URL}"
+        env:
+          JOB_SECRET: ${{ secrets.JOB_SECRET }}
+          SLAB_URL: ${{ secrets.SLAB_URL }}

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -35,7 +35,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -49,6 +49,14 @@ jobs:
          mv linelint-linux-amd64 /usr/local/bin/linelint
          make check_newline

+      # This is needed for the ws tests clippy checks
+      - name: Use specific data branch
+        if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
+        env:
+          PR_BRANCH: ${{ github.head_ref || github.ref_name }}
+        run: |
+          echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
+
      - name: Run pcc checks
        if: ${{ contains(matrix.os, 'ubuntu') }}
        run: |
@@ -94,5 +102,10 @@ jobs:
        run: |
          make build_tfhe_coverage

+      - name: Run Hpu pcc checks
+        if: ${{ contains(matrix.os, 'ubuntu') }}
+        run: |
+          make pcc_hpu
+
      # The wasm build check is a bit annoying to set-up here and is done during the tests in
      # aws_tfhe_tests.yml
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -13,7 +13,7 @@ env:
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
  cancel-in-progress: true

 permissions:
@@ -51,7 +51,7 @@ jobs:
    runs-on: ${{ matrix.runner_type }}
    strategy:
      matrix:
-        runner_type: [ubuntu-latest, macos-latest, windows-latest]
+        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
      fail-fast: false
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -82,7 +82,7 @@ jobs:
    runs-on: ${{ matrix.runner_type }}
    strategy:
      matrix:
-        runner_type: [ubuntu-latest, macos-latest, windows-latest]
+        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -13,7 +13,7 @@ env:
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
  cancel-in-progress: true

 permissions:
@@ -51,7 +51,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
      fail-fast: false
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -77,7 +77,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -3,14 +3,15 @@ name: Check commit and PR compliance
 on:
  pull_request:

-permissions:
-  contents: read
-  pull-requests: read # Permission needed to scan commits in a pull-request
+permissions: {}

 jobs:
  check-commit-pr:
    name: Check commit and PR
    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write # Permission needed to scan commits in a pull-request and write issue comment
    steps:
      - name: Check first line
        uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -25,10 +25,10 @@ jobs:

      - name: Get actionlint
        run: |
-          wget "https://github.com/rhysd/actionlint/releases/download/v${{ env.ACTIONLINT_VERSION }}/actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz"
-          echo "${{ env.ACTIONLINT_CHECKSUM }} actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz" > checksum
+          wget "https://github.com/rhysd/actionlint/releases/download/v${ACTIONLINT_VERSION}/actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz"
+          echo "${ACTIONLINT_CHECKSUM} actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz" > checksum
          sha256sum -c checksum
-          tar -xf actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz actionlint
+          tar -xf actionlint_"${ACTIONLINT_VERSION}"_linux_amd64.tar.gz actionlint
          ln -s "$(pwd)/actionlint" /usr/local/bin/

      - name: Lint workflows
@@ -38,9 +38,11 @@ jobs:
      - name: Check workflows security
        run: |
          make check_workflow_security
+        env:
+          GH_TOKEN: ${{ env.CHECKOUT_TOKEN }}

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@4830be28ce81da52ec70d65c552a7403821d98d4 # v3.0.23
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@fc87bb5b5a97953d987372e74478de634726b3e5 # v3.0.25
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -54,7 +54,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -90,7 +90,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
+        uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -104,7 +104,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
+        uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -66,7 +66,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

--- a/.github/workflows/data_pr_close.yml
+++ b/.github/workflows/data_pr_close.yml
@@ -1,137 +0,0 @@
-name: Close or Merge corresponding PR on the data repo
-
-# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
-
-env:
-  TARGET_REPO_API_URL: ${{ github.api_url }}/repos/zama-ai/tfhe-backward-compat-data
-  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  PR_BRANCH: ${{ github.head_ref || github.ref_name }}
-  CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}
-
-# only trigger on pull request closed events
-on:
-  pull_request:
-    types: [ closed ]
-
-# The same pattern is used for jobs that use the github api:
-# - save the result of the API call in the env var "GH_API_RES". Since the var is multiline
-# we use this trick: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
-# - "set +e" will make sure we reach the last "echo EOF" even in case of error
-# - "set -o" pipefail makes one line piped command return the error of the first failure
-# - 'RES="$?"' and 'exit $RES' are used to return the error code if a command failed. Without it, with "set +e"
-# the script will always return 0 because of the "echo EOF".
-
-
-
-permissions: {}
-
-jobs:
-  auto_close_job:
-    if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
-    runs-on: ubuntu-latest
-    steps:
-    - name: Find corresponding Pull Request in the data repo
-      run: |
-        {
-          set +e
-          set -o pipefail
-          echo 'TARGET_REPO_PR<<EOF'
-          curl --fail-with-body --no-progress-meter -L -X GET \
-          -H "Accept: application/vnd.github+json" \
-          -H "X-GitHub-Api-Version: 2022-11-28"  \
-          "${TARGET_REPO_API_URL}"/pulls\?head="${REPO_OWNER}":"${PR_BRANCH}" | jq -e '.[0]' | sed 's/null/{ "message": "corresponding PR not found" }/'
-          RES="$?"
-          echo EOF
-        } >> "${GITHUB_ENV}"
-        exit $RES
-      env:
-        REPO_OWNER: ${{ github.repository_owner }}
-
-    - name: Comment on the PR to indicate the reason of the close
-      run: |
-        BODY="'{ \"body\": \"PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}\" }'"
-        {
-          set +e
-          set -o pipefail
-          echo 'GH_API_RES<<EOF'
-          curl --fail-with-body --no-progress-meter -L -X POST \
-          -H "Accept: application/vnd.github+json" \
-          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-          -H "X-GitHub-Api-Version: 2022-11-28" \
-          "${COMMENTS_URL}" \
-          -d "${BODY}"
-          RES="$?"
-          echo EOF
-        } >> "${GITHUB_ENV}"
-        exit $RES
-      env:
-        REPO: ${{ github.repository }}
-        EVENT_NUMBER: ${{ github.event.number }}
-        COMMENTS_URL: ${{ fromJson(env.TARGET_REPO_PR).comments_url }}
-
-    - name: Merge the Pull Request in the data repo
-      if: ${{ github.event.pull_request.merged }}
-      run: |
-        {
-          set +e
-          set -o pipefail
-          echo 'GH_API_RES<<EOF'
-          curl --fail-with-body --no-progress-meter -L -X PUT \
-          -H "Accept: application/vnd.github+json" \
-          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-          -H "X-GitHub-Api-Version: 2022-11-28" \
-          "${TARGET_REPO_PR_URL}"/merge \
-          -d '{ "merge_method": "rebase" }'
-          RES="$?"
-          echo EOF
-        } >> "${GITHUB_ENV}"
-        exit $RES
-      env:
-        TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
-
-    - name: Close the Pull Request in the data repo
-      if: ${{ !github.event.pull_request.merged }}
-      run: |
-        {
-          set +e
-          set -o pipefail
-          echo 'GH_API_RES<<EOF'
-          curl --fail-with-body --no-progress-meter -L -X PATCH \
-          -H "Accept: application/vnd.github+json" \
-          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-          -H "X-GitHub-Api-Version: 2022-11-28" \
-          "${TARGET_REPO_PR_URL}" \
-          -d '{ "state": "closed" }'
-          RES="$?"
-          echo EOF
-        } >> "${GITHUB_ENV}"
-        exit $RES
-      env:
-        TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
-
-    - name: Delete the associated branch in the data repo
-      run: |
-        {
-          set +e
-          set -o pipefail
-          echo 'GH_API_RES<<EOF'
-          curl --fail-with-body --no-progress-meter -L -X DELETE \
-          -H "Accept: application/vnd.github+json" \
-          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-          -H "X-GitHub-Api-Version: 2022-11-28" \
-          "${TARGET_REPO_API_URL}"/git/refs/heads/"${PR_BRANCH}"
-          RES="$?"
-          echo EOF
-        } >> "${GITHUB_ENV}"
-        exit $RES
-
-    - name: Slack Notification
-      if: ${{ always() && job.status == 'failure' }}
-      continue-on-error: true
-      uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
-      env:
-        SLACK_COLOR: ${{ job.status }}
-        SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -45,7 +45,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -140,7 +140,7 @@ jobs:
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -172,9 +172,10 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -124,7 +124,7 @@ jobs:
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -156,9 +156,10 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -79,7 +79,7 @@ jobs:
          gcc-version: ${{ matrix.gcc }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -126,7 +126,7 @@ jobs:
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -137,7 +137,7 @@ jobs:
      # No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
      - name: Run multi-bit CUDA integer tests
        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_integer_multi_bit_gpu_ci
+          BIG_TESTS_INSTANCE=TRUE NO_BIG_PARAMS_GPU=TRUE make test_integer_multi_bit_gpu_ci

      - name: Run user docs tests
        run: |
@@ -149,7 +149,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          BIG_TESTS_INSTANCE=FALSE make test_high_level_api_gpu
+          make test_high_level_api_gpu

  slack-notify:
    name: Slack Notification
@@ -161,9 +161,10 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -11,6 +11,7 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+  IS_PR: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -18,6 +19,8 @@ on:
  schedule:
    # Nightly tests will be triggered each evening 8p.m.
    - cron: "0 20 * * *"
+  pull_request:
+

 permissions:
  contents: read
@@ -72,13 +75,17 @@ jobs:
          gcc-version: ${{ matrix.gcc }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

      - name: Run tests
        run: |
-          make test_integer_long_run_gpu
+          if [[ "${IS_PR}" == "true" ]]; then
+            make test_integer_short_run_gpu
+          else
+            make test_integer_long_run_gpu
+          fi

  slack-notify:
    name: Slack Notification
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -1,4 +1,4 @@
-# Perfom tfhe-cuda-backend post-commit checks on an AWS instance
+# Perform tfhe-cuda-backend post-commit checks on an AWS instance
 name: Cuda - Post-commit Checks

 env:
@@ -81,16 +81,20 @@ jobs:
        if: env.SECRETS_AVAILABLE == 'false'
        shell: bash
        run: |
-          TOOLKIT_VERSION="$(echo ${{ matrix.cuda }} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
+          # Use sed to extract a value from a string; this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/"${CUDA_KEYRING_PACKAGE}"
          echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
          sha256sum -c checksum
          sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
          sudo apt update
          sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
+        env:
+          CUDA_VERSION: ${{ matrix.cuda }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -100,17 +104,21 @@ jobs:
          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc" >> "${GITHUB_ENV}"
+        env:
+          CUDA_VERSION: ${{ matrix.cuda }}

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
+            echo "CXX=/usr/bin/g++-${GCC_VERSION}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
          } >> "${GITHUB_ENV}"
+        env:
+          GCC_VERSION: ${{ matrix.gcc }}

      - name: Run fmt checks
        run: |
@@ -120,12 +128,17 @@ jobs:
        run: |
          make pcc_gpu

+      - name: Check build with hpu enabled
+        run: |
+          make clippy_gpu_hpu
+
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Slack Notification
        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -126,7 +126,7 @@ jobs:
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -144,9 +144,10 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -140,7 +140,7 @@ jobs:
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -158,9 +158,10 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -25,9 +25,6 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-  schedule:
-    # Nightly tests @ 1AM after each work day
-    - cron: "0 1 * * MON-FRI"

 permissions:
  contents: read
@@ -130,7 +127,7 @@ jobs:
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -156,9 +153,10 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -126,7 +126,7 @@ jobs:
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -144,9 +144,10 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -140,7 +140,7 @@ jobs:
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -158,9 +158,10 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -25,9 +25,6 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-  schedule:
-    # Nightly tests @ 1AM after each work day
-    - cron: "0 1 * * MON-FRI"

 permissions:
  contents: read
@@ -130,7 +127,7 @@ jobs:
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -156,9 +153,10 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), "  >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -0,0 +1,72 @@
+# Run the high-level API tests and the user documentation tests against the HPU mockup
+name: Cargo Test HLAPI HPU
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+env:
+  CARGO_TERM_COLOR: always
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
+  cancel-in-progress: true
+
+
+permissions: { }
+
+jobs:
+  should-run:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read
+    outputs:
+      hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
+        with:
+          files_yaml: |
+            hpu:
+              - tfhe/Cargo.toml
+              - Makefile
+              - backends/tfhe-hpu-backend/**
+              - mockups/tfhe-hpu-mockup/**
+
+  cargo-tests-hpu:
+    needs: should-run
+    if: needs.should-run.outputs.hpu_test == 'true'
+    runs-on: large_ubuntu_16
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
+        with:
+          toolchain: stable
+
+      - name: Install Just
+        run: |
+          cargo install just
+
+      - name: Test HLAPI HPU
+        run: |
+          source setup_hpu.sh
+          just -f mockups/tfhe-hpu-mockup/Justfile  BUILD_PROFILE=release mockup &
+          make HPU_CONFIG=sim test_high_level_api_hpu
+          make HPU_CONFIG=sim test_user_doc_hpu
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -57,7 +57,7 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -46,7 +46,7 @@ jobs:
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -67,7 +67,7 @@ jobs:
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -78,19 +78,24 @@ jobs:
          {
            echo "CUDA_PATH=$CUDA_PATH";
            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+            echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
          } >> "${GITHUB_ENV}"
+        env:
+          CUDA_VERSION: ${{ matrix.cuda }}

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
+            echo "CXX=/usr/bin/g++-${GCC_VERSION}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"
+        env:
+          GCC_VERSION: ${{ matrix.gcc }}
+
      - name: Prepare package
        run: |
          cargo package -p tfhe-cuda-backend
@@ -129,7 +134,7 @@ jobs:
      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

@@ -140,19 +145,23 @@ jobs:
          {
            echo "CUDA_PATH=$CUDA_PATH";
            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+            echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
          } >> "${GITHUB_ENV}"
+        env:
+          CUDA_VERSION: ${{ matrix.cuda }}

      # Specify the correct host compilers
      - name: Export gcc and g++ variables
        if: ${{ !cancelled() }}
        run: |
          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CC=/usr/bin/gcc-${GCC_VERSION}";
+            echo "CXX=/usr/bin/g++-${GCC_VERSION}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"
+        env:
+          GCC_VERSION: ${{ matrix.gcc }}

      - name: Publish crate.io package
        env:
--- a/.github/workflows/make_release_hpu.yml
+++ b/.github/workflows/make_release_hpu.yml
@@ -0,0 +1,105 @@
+name: Publish HPU release
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}
+
+jobs:
+  verify_tag:
+    uses: ./.github/workflows/verify_tagged_commit.yml
+    secrets:
+      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+
+  package:
+    runs-on: ubuntu-latest
+    needs: verify_tag
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe-hpu-backend
+      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: crate
+          path: target/package/*.crate
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+  provenance:
+    if: ${{ !inputs.dry_run  }}
+    needs: [package]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package.outputs.hash }}
+
+  publish_release:
+    name: Publish tfhe-hpu-backend Release
+    runs-on: ubuntu-latest
+    needs: [verify_tag, package] # for comparing hashes
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Publish crate.io package
+        env:
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+        run: |
+          # DRY_RUN must not be double-quoted: when it is empty, a quoted expansion would pass an empty
+          # argument and cargo publish would fail. This is safe since DRY_RUN is set in the env section above.
+          # shellcheck disable=SC2086
+          cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
+
+      - name: Generate hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
+        env:
+          SLACK_COLOR: failure
+          SLACK_MESSAGE: "SLSA tfhe-hpu-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "tfhe-hpu-backend release failed: (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -21,7 +21,7 @@ jobs:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: git-sync
-        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
+        uses: valtech-sd/git-sync@e734cfe9485a92e720eac5af8a4555dde5fecf88
        with:
          source_repo: "zama-ai/tfhe-rs"
          source_branch: "main"
--- a/.github/workflows/unverified_prs.yml
+++ b/.github/workflows/unverified_prs.yml
@@ -0,0 +1,26 @@
+name: 'Close unverified PRs'
+on:
+  schedule:
+    - cron: '30 1 * * *'
+
+permissions: {}
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: read
+      pull-requests: write
+    steps:
+      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+        with:
+          stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
+          days-before-stale: 2
+          days-before-close: 0
+          # We do not want issues to be marked stale, so we restrict them to a label that does not exist yet;
+          # if we ever allow issues to become stale/closable, this label will be the signal for that
+          only-issue-labels: can-be-auto-closed
+          # Only unverified PRs are an issue
+          exempt-pr-labels: cla-signed
+          # We don't want people commenting just to keep an unverified PR open
+          ignore-updates: true
--- a/.gitignore
+++ b/.gitignore
@@ -40,3 +40,6 @@ __pycache__
 # First directive is to ignore symlinks
 tests/tfhe-backward-compat-data
 ci/
+
+# In case someone clones the lattice-estimator locally to verify security
+/lattice-estimator
--- a/.lfsconfig
+++ b/.lfsconfig
@@ -0,0 +1,2 @@
+[lfs]
+  fetchexclude = *
--- a/.linelint.yml
+++ b/.linelint.yml
@@ -10,6 +10,7 @@ ignore:
  - keys
  - coverage
  - utils/tfhe-lints/ui/main.stderr
+  - utils/tfhe-backward-compat-data/**/*.ron # ron files are autogenerated

 rules:
  # checks if file ends in a newline character
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,12 +1,28 @@
 # Specifying a path without code owners means that path won't have owners and is akin to a negation
 # i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
 # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file
+
+/backends/tfhe-cuda-backend/            @agnesLeroy
+/backends/tfhe-hpu-backend/             @zama-ai/hardware
+
+/tfhe/examples/hpu                      @zama-ai/hardware
+
 /tfhe/src/core_crypto/                  @IceTDrinker
-/tfhe/src/core_crypto/gpu
+/tfhe/src/core_crypto/gpu               @agnesLeroy
+/tfhe/src/core_crypto/hpu               @zama-ai/hardware

 /tfhe/src/shortint/                     @mayeul-zama

 /tfhe/src/integer/                      @tmontaigu
-/tfhe/src/integer/gpu
+/tfhe/src/integer/gpu                   @agnesLeroy
+/tfhe/src/integer/hpu                   @zama-ai/hardware

 /tfhe/src/high_level_api/               @tmontaigu
+
+/Makefile                               @IceTDrinker @soonum
+
+/mockups/tfhe-hpu-mockup                @zama-ai/hardware
+
+/.github/                               @soonum
+
+/CODEOWNERS                             @IceTDrinker
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -170,6 +170,8 @@ On the contrary, these changes are *not* data breaking:
 * Renaming a type (unless it implements the `Named` trait).
 * Adding a variant to the end of an enum.

+Historical data from previous TFHE-rs versions are stored inside `utils/tfhe-backward-compat-data`. They are used to check on every PR that backward compatibility has been preserved.
+
 ## Example: adding a field

 Suppose you want to add an i32 field to a type named `MyType`. The original type is defined as:
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,14 +9,16 @@ members = [
    "tasks",
    "tfhe-csprng",
    "backends/tfhe-cuda-backend",
+    "backends/tfhe-hpu-backend",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
    "utils/param_dedup",
    "tests",
+    "mockups/tfhe-hpu-mockup",
 ]

 exclude = [
-    "tests/backward_compatibility_tests",
+    "utils/tfhe-backward-compat-data",
    "utils/tfhe-lints",
    "apps/trivium",
 ]
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@ SHELL:=$(shell /usr/bin/env which bash)
 OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
+CARGO_BUILD_JOBS=default
 CPU_COUNT=$(shell ./scripts/cpu_count.sh)
 RS_BUILD_TOOLCHAIN:=stable
 CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
@@ -21,10 +22,7 @@ BENCH_TYPE?=latency
 BENCH_PARAM_TYPE?=classical
 BENCH_PARAMS_SET?=default
 NODE_VERSION=22.6
-BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
-BACKWARD_COMPAT_DATA_BRANCH?=$(shell ./scripts/backward_compat_data_version.py)
-BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
-BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
+BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
 TFHE_SPEC:=tfhe
 WASM_PACK_VERSION="0.13.1"
 # We are kind of hacking the cut here, the version cannot contain a quote '"'
@@ -55,6 +53,9 @@ REGEX_PATTERN?=''
 TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
 TFHECUDA_BUILD=$(TFHECUDA_SRC)/build

+# tfhe-hpu-backend
+HPU_CONFIG=v80
+
 # Exclude these files from coverage reports
 define COVERAGE_EXCLUDED_FILES
 --exclude-files apps/trivium/src/trivium/* \
@@ -155,20 +156,24 @@ install_tarpaulin: install_rs_build_toolchain

 .PHONY: install_cargo_dylint # Install custom tfhe-rs lints
 install_cargo_dylint:
-	cargo install cargo-dylint dylint-link
+	cargo install --locked cargo-dylint dylint-link

 .PHONY: install_typos_checker # Install typos checker
 install_typos_checker: install_rs_build_toolchain
 	@typos --version > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked typos-cli || \
 	( echo "Unable to install typos-cli, unknown error." && exit 1 )

 .PHONY: install_zizmor # Install zizmor workflow security checker
 install_zizmor: install_rs_build_toolchain
 	@zizmor --version > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked zizmor --version ~1.9 || \
 	( echo "Unable to install zizmor, unknown error." && exit 1 )

+.PHONY: install_cargo_cross # Install cross for big endian tests
+install_cargo_cross: install_rs_build_toolchain
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked cross
+
 .PHONY: setup_venv # Setup Python virtualenv for wasm tests
 setup_venv:
 	python3 -m venv venv
@@ -244,6 +249,9 @@ install_mlc: install_rs_build_toolchain
 .PHONY: fmt # Format rust code
 fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt

 .PHONY: fmt_js # Format javascript code
 fmt_js: check_nvm_installed
@@ -265,6 +273,9 @@ fmt_c_tests:
 .PHONY: check_fmt # Check rust code format
 check_fmt: install_rs_check_toolchain
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C $(BACKWARD_COMPAT_DATA_DIR) fmt --check
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C utils/tfhe-lints fmt --check
+	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options -C apps/trivium fmt --check

 .PHONY: check_fmt_c_tests  # Check C tests format
 check_fmt_c_tests:
@@ -290,7 +301,7 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
 		--all-targets \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

@@ -301,6 +312,20 @@ check_gpu: install_rs_check_toolchain
 		--all-targets \
 		-p $(TFHE_SPEC)

+.PHONY: clippy_hpu # Run clippy lints on tfhe with "hpu" enabled
+clippy_hpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean,shortint,integer,internal-keycache,hpu,pbs-stats,extended-types \
+		--all-targets \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings
+
+.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
+clippy_gpu_hpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
+		--all-targets \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings
+
 .PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
 fix_newline: check_linelint_installed
 	linelint -a .
@@ -440,6 +465,8 @@ clippy_tfhe_csprng: install_rs_check_toolchain
 clippy_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-zk-pok -- --no-deps -D warnings
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings

 .PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
 clippy_versionable: install_rs_check_toolchain
@@ -459,10 +486,22 @@ clippy_param_dedup: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p param_dedup -- --no-deps -D warnings

+.PHONY: clippy_backward_compat_data # Run clippy lints on tfhe-backward-compat-data
+clippy_backward_compat_data: install_rs_check_toolchain # the toolchain is selected with toolchain.toml
+	@# Some old crates are x86 specific, only run in that case
+	@if uname -a | grep -q x86; then \
+		RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" -Z unstable-options \
+			-C $(BACKWARD_COMPAT_DATA_DIR) clippy --all-targets \
+			-- --no-deps -D warnings; \
+	else \
+		echo "Cannot run clippy for backward compat crate on non x86 platform for now."; \
+	fi
+
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
 clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
-clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup
+clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup \
+clippy_backward_compat_data

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -473,6 +512,17 @@ clippy_cuda_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-cuda-backend -- --no-deps -D warnings

+.PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend
+clippy_hpu_backend: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-hpu-backend -- --no-deps -D warnings
+
+.PHONY: clippy_hpu_mockup # Run clippy lints on tfhe-hpu-mockup
+clippy_hpu_mockup: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--all-targets \
+		-p tfhe-hpu-mockup -- --no-deps -D warnings
+
 .PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
 check_rust_bindings_did_not_change:
 	cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
@@ -485,6 +535,9 @@ check_rust_bindings_did_not_change:
 tfhe_lints: install_cargo_dylint
 	RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe --no-deps -- \
 		--features=boolean,shortint,integer,strings,zk-pok
+	RUSTFLAGS="$(RUSTFLAGS)" cargo dylint --all -p tfhe-zk-pok --no-deps -- \
+		--features=experimental
+

 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
@@ -626,6 +679,14 @@ test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
 		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

+.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
+test_integer_gpu_debug: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release_lto_off \
+		--features=integer,gpu-debug -vv -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=1 --nocapture
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
+		--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::
+
+
 .PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
 test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
 	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -634,6 +695,12 @@ test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
 		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
 		--tfhe-package "$(TFHE_SPEC)" --backend "gpu"

+.PHONY: test_integer_short_run_gpu # Run the short run integer tests on the gpu backend
+test_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
+	TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE \
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence integer::gpu::server_key::radix::tests_long_run::test_signed_random_op_sequence --test-threads=1 --nocapture
+
 .PHONY: test_integer_compression
 test_integer_compression: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -702,6 +769,28 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"

+.PHONY: test_integer_hpu_ci # Run the tests for integer ci on hpu backend
+test_integer_hpu_ci: install_rs_check_toolchain install_cargo_nextest
+	cargo test --release -p $(TFHE_SPEC) --features hpu-v80 --test hpu
+
+.PHONY: test_integer_hpu_mockup_ci # Run the tests for integer ci on hpu backend and mockup
+test_integer_hpu_mockup_ci: install_rs_check_toolchain install_cargo_nextest
+	source ./setup_hpu.sh --config sim ; \
+	cargo build --release --bin hpu_mockup; \
+    coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \
+	HPU_TEST_ITER=1 \
+	cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
+	kill %1
+
+.PHONY: test_integer_hpu_mockup_ci_fast # Run the quick tests for integer ci on hpu backend and mockup.
+test_integer_hpu_mockup_ci_fast: install_rs_check_toolchain install_cargo_nextest
+	source ./setup_hpu.sh --config sim ; \
+	cargo build --profile devo --bin hpu_mockup; \
+    coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \
+	HPU_TEST_ITER=1 \
+	cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
+	kill %1
+
 .PHONY: test_boolean # Run the tests of the boolean module
 test_boolean: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -854,9 +943,25 @@ test_high_level_api: install_rs_build_toolchain

 test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
-		--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \
+		--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
 		-E "test(/high_level_api::.*gpu.*/)"

+test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
+ifeq ($(HPU_CONFIG), v80)
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
+		--build-jobs=$(CARGO_BUILD_JOBS) \
+		--test-threads=1 \
+		--features=integer,internal-keycache,hpu,hpu-v80 -p $(TFHE_SPEC) \
+		-E "test(/high_level_api::.*hpu.*/)"
+else
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
+		--build-jobs=$(CARGO_BUILD_JOBS) \
+		--test-threads=1 \
+		--features=integer,internal-keycache,hpu -p $(TFHE_SPEC) \
+		-E "test(/high_level_api::.*hpu.*/)"
+endif
+
+
 .PHONY: test_strings # Run the tests for strings ci
 test_strings: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -874,9 +979,21 @@ test_user_doc: install_rs_build_toolchain
 .PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
 test_user_doc_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
+		--features=internal-keycache,integer,zk-pok,gpu -p $(TFHE_SPEC) \
 		-- test_user_docs::

+.PHONY: test_user_doc_hpu # Run tests for HPU from the .md documentation
+test_user_doc_hpu: install_rs_build_toolchain
+ifeq ($(HPU_CONFIG), v80)
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
+		--features=internal-keycache,integer,hpu,hpu-v80 -p $(TFHE_SPEC) \
+		-- test_user_docs::
+else
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
+		--features=internal-keycache,integer,hpu -p $(TFHE_SPEC) \
+		-- test_user_docs::
+endif
+


 .PHONY: test_regex_engine # Run tests for regex_engine example
@@ -907,10 +1024,16 @@ test_tfhe_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		-p tfhe-csprng

+.PHONY: test_tfhe_csprng_big_endian # Run tfhe-csprng tests on an emulated big endian system
+test_tfhe_csprng_big_endian: install_rs_build_toolchain install_cargo_cross
+	RUSTFLAGS="" cross $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
+
+
 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		-p tfhe-zk-pok
+		-p tfhe-zk-pok --features experimental

 .PHONY: test_zk_wasm_x86_compat_ci
 test_zk_wasm_x86_compat_ci: check_nvm_installed
@@ -940,16 +1063,11 @@ test_tfhe_lints: install_cargo_dylint
 # Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
 .PHONY: test_backward_compatibility_ci
 test_backward_compatibility_ci: install_rs_build_toolchain
-	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tests/$(BACKWARD_COMPAT_DATA_DIR)\"" \
+	TFHE_BACKWARD_COMPAT_DATA_DIR="../$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--features=shortint,integer,zk-pok -p tests test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
-test_backward_compatibility: tests/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
-
-.PHONY: backward_compat_branch # Prints the required backward compatibility branch
-backward_compat_branch:
-	@echo "$(BACKWARD_COMPAT_DATA_BRANCH)"
+test_backward_compatibility: pull_backward_compat_data test_backward_compatibility_ci

 .PHONY: doc # Build rust doc
 doc: install_rs_check_toolchain
@@ -994,6 +1112,10 @@ check_intra_md_links: install_mlc
 check_md_links: install_mlc
 	mlc --match-file-extension tfhe/docs

+.PHONY: check_doc_paths_use_dash # Check paths use "-" instead of "_" in docs for gitbook compatibility
+check_doc_paths_use_dash:
+	python3 ./scripts/check_doc_paths_use_dash.py
+
 .PHONY: check_parameter_export_ok # Checks exported "current" shortint parameter module is correct
 check_parameter_export_ok:
 	python3 ./scripts/check_current_param_export.py
@@ -1012,7 +1134,7 @@ check_compile_tests: install_rs_build_toolchain
 .PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
 check_compile_tests_benches_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
-		--features=experimental,boolean,shortint,integer,internal-keycache,gpu \
+		--features=experimental,boolean,shortint,integer,internal-keycache,gpu,zk-pok \
 		-p $(TFHE_SPEC)
 	mkdir -p "$(TFHECUDA_BUILD)" && \
 		cd "$(TFHECUDA_BUILD)" && \
@@ -1100,6 +1222,12 @@ clippy_bench_gpu: install_rs_check_toolchain
 		--features=gpu,shortint,integer,internal-keycache,nightly-avx512,pbs-stats,zk-pok \
 		-p tfhe-benchmark -- --no-deps -D warnings

+.PHONY: clippy_bench_hpu # Run clippy lints on tfhe-benchmark with "hpu" enabled
+clippy_bench_hpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		--features=hpu,shortint,integer,internal-keycache,pbs-stats\
+		-p tfhe-benchmark -- --no-deps -D warnings
+
 .PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
 print_doc_bench_parameters:
 	RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
@@ -1133,6 +1261,14 @@ bench_signed_integer_gpu: install_rs_check_toolchain
 	--bench integer-signed-bench \
 	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

+.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
+bench_integer_hpu: install_rs_check_toolchain
+	source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-bench \
+	--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
+
 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1146,7 +1282,7 @@ bench_integer_compression_gpu: install_rs_check_toolchain
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
 	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
-	
+
 .PHONY: bench_integer_zk_gpu
 bench_integer_zk_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1324,11 +1460,33 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
 	--bench hlapi-dex \
 	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --

+.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
+bench_hlapi_erc20_hpu: install_rs_check_toolchain
+	source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
+	RUSTFLAGS="$(RUSTFLAGS)" \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-erc20 \
+	--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
+
 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
 bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench -p tfhe-zk-pok --

+.PHONY: bench_hlapi_noise_squash # Run benchmarks for noise squash operation
+bench_hlapi_noise_squash: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
+
+.PHONY: bench_hlapi_noise_squash_gpu # Run benchmarks for noise squash operation on GPU
+bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-noise-squash \
+	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
+
 #
 # Utility tools
 #
@@ -1384,13 +1542,15 @@ parse_wasm_benchmarks: install_rs_check_toolchain
 .PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
 write_params_to_file: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
-	--example write_params_to_file --features=boolean,shortint,internal-keycache
+	--example write_params_to_file --features=boolean,shortint,hpu,internal-keycache

-.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
-clone_backward_compat_data:
-	./scripts/clone_backward_compat_data.sh $(BACKWARD_COMPAT_DATA_URL) $(BACKWARD_COMPAT_DATA_BRANCH) tests/$(BACKWARD_COMPAT_DATA_DIR)
+.PHONY: pull_backward_compat_data # Pull the data files needed for backward compatibility tests
+pull_backward_compat_data:
+	./scripts/pull_lfs_data.sh $(BACKWARD_COMPAT_DATA_DIR)

-tests/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
+.PHONY: pull_hpu_files # Pull the hpu files
+pull_hpu_files:
+	./scripts/pull_lfs_data.sh backends/tfhe-hpu-backend/

 #
 # Real use case examples
@@ -1416,16 +1576,20 @@ sha256_bool: install_rs_check_toolchain

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
 pcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
-check_md_docs_are_tested check_intra_md_links clippy_all check_compile_tests test_tfhe_lints \
+check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash \
+clippy_all check_compile_tests test_tfhe_lints \
 tfhe_lints

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
 clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu

+.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
+pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast
+
 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
 fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
-check_md_docs_are_tested clippy_fast check_compile_tests
+check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash clippy_fast check_compile_tests

 .PHONY: conformance # Automatically fix problems that can be fixed
 conformance: fix_newline fmt fmt_js
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@
  <a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
  <a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
  <a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
+  <a href="https://slsa.dev"><img alt="SLSA 3" src="https://slsa.dev/images/gh-badge-level3.svg" /></a>
 </p>

 ## About
@@ -148,7 +149,7 @@ To run this code, use the following command:
 > Note that when running code that uses `TFHE-rs`, it is highly recommended
 to run in release mode with cargo's `--release` flag to have the best performances possible.

-*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
+*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*

 <p align="right">
  <a href="#about" > ↑ Back to top </a>
--- a/_typos.toml
+++ b/_typos.toml
@@ -11,11 +11,13 @@ extend-ignore-identifiers-re = [
    # Example with string replacing "hello" with "herlo"
    "herlo",
    # Example in trivium
-    "C9217BA0D762ACA1"
+    "C9217BA0D762ACA1",
+    "0x[0-9a-fA-F]+"
 ]

 [files]
 extend-exclude = [
    "backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
    "backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
+    "backends/tfhe-hpu-backend/config_store/**/*.link_summary",
 ]
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.

 # FHE shortint Trivium implementation

-The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
+The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
 It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
 on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes
 its setup a little more intricate.
@@ -137,10 +137,10 @@ its setup a little more intricate.
 Example code:
 ```rust
 use tfhe::shortint::prelude::*;
-use tfhe::shortint::parameters::v1_2::{
-    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::current_params::{
+    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{ConfigBuilder, generate_keys, FheUint64};
 use tfhe::prelude::*;
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

-    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
+        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_2::{
-    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::current_params::{
+    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {

 pub fn kreyvium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {

 pub fn kreyvium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_2::{
-    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::current_params::{
+    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};

 pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {

 pub fn trivium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {

 pub fn trivium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,9 +1,9 @@
 use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_2::{
-    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::current_params::{
+    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo renaud1239/Kreyvium,
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn kreyvium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,9 +1,9 @@
 use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_2::{
-    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::current_params::{
+    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn trivium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.10.0"
+version = "0.11.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -19,3 +19,4 @@ bindgen = "0.71"
 [features]
 experimental-multi-arch = []
 profile = []
+debug = []
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -53,6 +53,11 @@ fn main() {
            cmake_config.define("USE_NVTOOLS", "OFF");
        }

+        if cfg!(feature = "debug") {
+            cmake_config.define("CMAKE_BUILD_TYPE", "DEBUG");
+            cmake_config.define("CMAKE_CXX_FLAGS", "-Wuninitialized -O0");
+        }
+
        // Build the CMake project
        let dest = cmake_config.build();
        println!("cargo:rustc-link-search=native={}", dest.display());
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -52,6 +52,8 @@ endif()

 if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
+else()
+  message("Building CUDA backend in ${CMAKE_BUILD_TYPE}")
 endif()

 # Add OpenMP support
--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -28,9 +28,10 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,

 void cuda_improve_noise_modulus_switch_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
-    uint32_t num_lwes, uint32_t num_zeros, double input_variance,
-    double r_sigma, double bound, uint32_t log_modulus);
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus);

 void cuda_glwe_sample_extract_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -49,12 +49,13 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index);

 void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
-                                           uint64_t *size_tracker,
+                                           uint64_t &size_tracker,
                                           bool allocate_gpu_memory);

 void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

 bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
+uint64_t cuda_device_total_memory(uint32_t gpu_index);

 void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
                                                 uint64_t size,
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -24,7 +24,15 @@ using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
      return std::get<Torus *>(variant);                                       \
    }                                                                          \
  }()
-
+// Yields variant[index] for a vector of uint64_t*, else the single uint64_t*
+#define GET_VARIANT_ELEMENT_64BIT(variant, index)                              \
+  [&] {                                                                        \
+    if (std::holds_alternative<std::vector<uint64_t *>>(variant)) {            \
+      return std::get<std::vector<uint64_t *>>(variant)[index];                \
+    } else {                                                                   \
+      return std::get<uint64_t *>(variant);                                    \
+    }                                                                          \
+  }()
 int get_active_gpu_count(int num_inputs, int gpu_count);

 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -20,7 +20,7 @@ template <typename Torus> struct int_compression {
                  uint32_t gpu_count, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
                  uint32_t storage_log_modulus, bool allocate_gpu_memory,
-                  uint64_t *size_tracker) {
+                  uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
    this->lwe_per_glwe = lwe_per_glwe;
@@ -38,7 +38,7 @@ template <typename Torus> struct int_compression {
        lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
        gpu_indexes[0], size_tracker, allocate_gpu_memory);

-    *size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
+    size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
        streams[0], gpu_indexes[0], &fp_ks_buffer,
        compression_params.small_lwe_dimension,
        compression_params.glwe_dimension, compression_params.polynomial_size,
@@ -76,7 +76,7 @@ template <typename Torus> struct int_decompression {
                    int_radix_params compression_params,
                    uint32_t num_radix_blocks, uint32_t body_count,
                    uint32_t storage_log_modulus, bool allocate_gpu_memory,
-                    uint64_t *size_tracker) {
+                    uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->encryption_params = encryption_params;
    this->compression_params = compression_params;
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -48,6 +48,34 @@ typedef struct {
  uint32_t lwe_dimension;
 } CudaRadixCiphertextFFI;

+typedef struct {
+  uint64_t const *chosen_multiplier_has_at_least_one_set;
+  uint64_t const *decomposed_chosen_multiplier;
+
+  uint32_t const num_scalars;
+  uint32_t const active_bits;
+  uint64_t const shift_pre;
+  uint32_t const shift_post;
+  uint32_t const ilog2_chosen_multiplier;
+  uint32_t const chosen_multiplier_num_bits;
+
+  bool const is_chosen_multiplier_zero;
+  bool const is_abs_chosen_multiplier_one;
+  bool const is_chosen_multiplier_negative;
+  bool const is_chosen_multiplier_pow2;
+  bool const chosen_multiplier_has_more_bits_than_numerator;
+  // if signed: test if chosen_multiplier >= 2^{num_bits - 1}
+  bool const is_chosen_multiplier_geq_two_pow_numerator;
+
+  uint32_t const ilog2_divisor;
+
+  bool const is_divisor_zero;
+  bool const is_abs_divisor_one;
+  bool const is_divisor_negative;
+  bool const is_divisor_pow2;
+  bool const divisor_has_more_bits_than_numerator;
+} CudaScalarDivisorFFI;
+
 uint64_t scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -395,7 +423,8 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory, bool allocate_ms_array);
+    bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
+    bool allocate_ms_array);

 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -414,7 +443,8 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
+    PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
+    bool allocate_ms_array);

 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -538,5 +568,168 @@ void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);

+void extend_radix_with_trivial_zero_blocks_msb_64(
+    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
+    void *const *streams, uint32_t const *gpu_indexes);
+
+void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
+                              CudaRadixCiphertextFFI const *input,
+                              void *const *streams,
+                              uint32_t const *gpu_indexes);
+
+uint64_t scratch_cuda_apply_noise_squashing_kb(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t input_glwe_dimension,
+    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_radix_blocks, uint32_t num_original_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, bool allocate_ms_array);
+
+void cuda_apply_noise_squashing_kb(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *output_radix_lwe,
+    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks);
+
+void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
+                                           uint32_t const *gpu_indexes,
+                                           uint32_t gpu_count,
+                                           int8_t **mem_ptr_void);
+
+uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
+    bool allocate_gpu_memory, bool allocate_ms_array);
+
+void cuda_sub_and_propagate_single_carry_kb_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
+    CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t requested_flag, uint32_t uses_carry);
+
+void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
+                                                 uint32_t const *gpu_indexes,
+                                                 uint32_t gpu_count,
+                                                 int8_t **mem_ptr_void);
+
+uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, bool allocate_ms_array);
+
+void cuda_integer_unsigned_scalar_div_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi);
+
+void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t num_additional_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, bool allocate_ms_array);
+
+void cuda_extend_radix_with_sign_msb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
+    int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
+
+void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
+                                                uint32_t const *gpu_indexes,
+                                                uint32_t gpu_count,
+                                                int8_t **mem_ptr_void);
+
+uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    bool allocate_gpu_memory, bool allocate_ms_array);
+
+void cuda_integer_signed_scalar_div_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *numerator_ct, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);
+
+void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
+    bool allocate_ms_array);
+
+void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    uint64_t const *divisor_has_at_least_one_set,
+    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
+    void const *clear_blocks, void const *h_clear_blocks,
+    uint32_t num_clear_blocks);
+
+void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    uint32_t const active_bits_divisor, bool allocate_gpu_memory,
+    bool allocate_ms_array);
+
+void cuda_integer_signed_scalar_div_rem_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *quotient_ct, CudaRadixCiphertextFFI *remainder_ct,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    const CudaScalarDivisorFFI *scalar_divisor_ffi,
+    uint64_t const *divisor_has_at_least_one_set,
+    uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
+    uint32_t numerator_bits);
+
+void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
 } // extern C
 #endif // CUDA_INTEGER_H
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_128_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_128_utilities.h
@@ -0,0 +1,13 @@
+#ifndef CUDA_BOOTSTRAP_128_H
+#define CUDA_BOOTSTRAP_128_H
+
+#include "pbs_enums.h"
+#include <stdint.h>
+
+uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
+    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+    bool allocate_gpu_memory, bool allocate_ms_array);
+
+#endif // CUDA_BOOTSTRAP_128_H
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -66,6 +66,9 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    uint32_t num_many_lut, uint32_t lut_stride);

 template <typename Torus>
+uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle(
+    uint32_t polynomial_size);
+template <typename Torus>
 uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
    uint32_t polynomial_size);
 template <typename Torus>
@@ -95,8 +98,12 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(

 template <typename Torus, class params>
 uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                            uint32_t polynomial_size);
-
+                            uint32_t polynomial_size,
+                            uint64_t full_sm_keybundle);
+template <typename Torus, class params>
+uint32_t get_lwe_chunk_size_128(uint32_t gpu_index, uint32_t max_num_pbs,
+                                uint32_t polynomial_size,
+                                uint64_t full_sm_keybundle);
 template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  int8_t *d_mem_keybundle = NULL;
  int8_t *d_mem_acc_step_one = NULL;
@@ -115,7 +122,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
-             uint64_t *size_tracker) {
+             uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);

@@ -281,4 +288,146 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  }
 };

+template <typename InputTorus>
+struct pbs_buffer_128<InputTorus, PBS_TYPE::MULTI_BIT> {
+  int8_t *d_mem_keybundle = NULL;
+  int8_t *d_mem_acc_step_one = NULL;
+  int8_t *d_mem_acc_step_two = NULL;
+  int8_t *d_mem_acc_cg = NULL;
+  int8_t *d_mem_acc_tbc = NULL;
+  uint32_t lwe_chunk_size;
+  double *keybundle_fft;
+  __uint128_t *global_accumulator;
+  double *global_join_buffer;
+
+  PBS_VARIANT pbs_variant;
+  bool gpu_memory_allocated;
+
+  pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
+                 uint32_t glwe_dimension, uint32_t polynomial_size,
+                 uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+                 uint32_t lwe_chunk_size, PBS_VARIANT pbs_variant,
+                 bool allocate_gpu_memory, uint64_t &size_tracker) {
+    gpu_memory_allocated = allocate_gpu_memory;
+    cuda_set_device(gpu_index);
+
+    this->pbs_variant = pbs_variant;
+    this->lwe_chunk_size = lwe_chunk_size;
+    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+
+    // default
+    uint64_t full_sm_keybundle =
+        get_buffer_size_full_sm_multibit_programmable_bootstrap_128_keybundle<
+            __uint128_t>(polynomial_size);
+    uint64_t full_sm_accumulate_step_one =
+        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<
+            __uint128_t>(polynomial_size);
+    uint64_t full_sm_accumulate_step_two =
+        get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<
+            __uint128_t>(polynomial_size);
+    uint64_t partial_sm_accumulate_step_one =
+        get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
+            __uint128_t>(polynomial_size);
+    // cg
+    uint64_t full_sm_cg_accumulate =
+        get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<__uint128_t>(
+            polynomial_size);
+    uint64_t partial_sm_cg_accumulate =
+        get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<
+            __uint128_t>(polynomial_size);
+
+    auto num_blocks_keybundle = input_lwe_ciphertext_count * lwe_chunk_size *
+                                (glwe_dimension + 1) * (glwe_dimension + 1) *
+                                level_count;
+    auto num_blocks_acc_step_one =
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
+    auto num_blocks_acc_step_two =
+        input_lwe_ciphertext_count * (glwe_dimension + 1);
+    auto num_blocks_acc_cg =
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count;
+
+    // Keybundle
+    if (max_shared_memory < full_sm_keybundle)
+      d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
+          num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
+          size_tracker, allocate_gpu_memory);
+
+    switch (pbs_variant) {
+    case PBS_VARIANT::CG:
+      // Accumulator CG
+      if (max_shared_memory < partial_sm_cg_accumulate)
+        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
+      else if (max_shared_memory < full_sm_cg_accumulate)
+        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
+      break;
+    case PBS_VARIANT::DEFAULT:
+      // Accumulator step one
+      if (max_shared_memory < partial_sm_accumulate_step_one)
+        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
+      else if (max_shared_memory < full_sm_accumulate_step_one)
+        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
+
+      // Accumulator step two
+      if (max_shared_memory < full_sm_accumulate_step_two)
+        d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
+      break;
+    default:
+      PANIC("Cuda error (PBS): unsupported implementation variant.")
+    }
+
+    keybundle_fft = (double *)cuda_malloc_with_size_tracking_async(
+        num_blocks_keybundle * (polynomial_size / 2) * 4 * sizeof(double),
+        stream, gpu_index, size_tracker, allocate_gpu_memory);
+    global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
+        input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
+            sizeof(__uint128_t),
+        stream, gpu_index, size_tracker, allocate_gpu_memory);
+    global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
+            (polynomial_size / 2) * 4 * sizeof(double),
+        stream, gpu_index, size_tracker, allocate_gpu_memory);
+  }
+
+  void release(cudaStream_t stream, uint32_t gpu_index) {
+
+    if (d_mem_keybundle)
+      cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
+                                         gpu_memory_allocated);
+    switch (pbs_variant) {
+    case DEFAULT:
+      if (d_mem_acc_step_one)
+        cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
+                                           gpu_index, gpu_memory_allocated);
+      if (d_mem_acc_step_two)
+        cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
+                                           gpu_index, gpu_memory_allocated);
+      break;
+    case CG:
+      if (d_mem_acc_cg)
+        cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
+                                           gpu_memory_allocated);
+      break;
+    default:
+      PANIC("Cuda error (PBS): unsupported implementation variant.")
+    }
+
+    cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
+                                       gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
+                                       gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
+                                       gpu_memory_allocated);
+  }
+};
+
 #endif // CUDA_MULTI_BIT_UTILITIES_H
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -90,7 +90,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
             uint32_t glwe_dimension, uint32_t polynomial_size,
             uint32_t level_count, uint32_t input_lwe_ciphertext_count,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
-             bool allocate_ms_array, uint64_t *size_tracker) {
+             bool allocate_ms_array, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);
    this->uses_noise_reduction = allocate_ms_array;
@@ -240,14 +240,16 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  }
 };

-template <PBS_TYPE pbs_type> struct pbs_buffer_128;
+template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer_128;

-template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
+template <typename InputTorus>
+struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
  int8_t *d_mem;

  __uint128_t *global_accumulator;
  double *global_join_buffer;
-  __uint128_t *temp_lwe_array_in;
+  InputTorus *temp_lwe_array_in;
+  uint64_t *trivial_indexes;

  PBS_VARIANT pbs_variant;
  bool uses_noise_reduction;
@@ -258,16 +260,30 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
                 uint32_t polynomial_size, uint32_t level_count,
                 uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
                 bool allocate_gpu_memory, bool allocate_ms_array,
-                 uint64_t *size_tracker) {
+                 uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);
    this->pbs_variant = pbs_variant;
    this->uses_noise_reduction = allocate_ms_array;
-    this->temp_lwe_array_in =
-        (__uint128_t *)cuda_malloc_with_size_tracking_async(
-            (lwe_dimension + 1) * input_lwe_ciphertext_count *
-                sizeof(__uint128_t),
-            stream, gpu_index, size_tracker, allocate_ms_array);
+    if (allocate_ms_array) {
+      this->temp_lwe_array_in = (InputTorus *)cuda_malloc_async(
+          (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(InputTorus),
+          stream, gpu_index);
+      this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          size_tracker, allocate_ms_array);
+      uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
+      for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
+        h_trivial_indexes[i] = i;
+
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          trivial_indexes, h_trivial_indexes,
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          allocate_gpu_memory);
+
+      cuda_synchronize_stream(stream, gpu_index);
+      delete[] h_trivial_indexes;
+    }
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
                                     input_lwe_ciphertext_count *
@@ -404,9 +420,12 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
      cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                         gpu_memory_allocated);

-    if (uses_noise_reduction)
+    if (uses_noise_reduction) {
      cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
                                         gpu_memory_allocated);
+      cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
+                                         gpu_memory_allocated);
+    }
  }
 };

@@ -502,7 +521,12 @@ template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
                                                    uint32_t glwe_dimension,
                                                    uint32_t polynomial_size,
-                                                    uint32_t level_count);
+                                                    uint32_t level_count,
+                                                    uint32_t max_shared_memory);
+
+bool has_support_to_cuda_programmable_bootstrap_128_cg(
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+    uint32_t num_samples, uint32_t max_shared_memory);

 #ifdef __CUDACC__
 __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap.h
@@ -100,7 +100,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void const *lut_vector, void const *lwe_array_in,
    void const *bootstrapping_key,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
-    void *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
+    void const *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples);

--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -15,6 +15,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
    uint32_t polynomial_size, uint32_t grouping_factor);

+void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
+    void *stream, uint32_t gpu_index, void *dest, void const *src,
+    uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
+    uint32_t polynomial_size, uint32_t grouping_factor);
+
 uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -33,6 +38,25 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                   uint32_t gpu_index,
                                                   int8_t **pbs_buffer);
+
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
+    void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t level_count,
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+
+void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_output_indexes, void const *lut_vector,
+    void const *lut_vector_indexes, void const *lwe_array_in,
+    void const *lwe_input_indexes, void const *bootstrapping_key,
+    int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
+    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
+    uint32_t lut_stride);
+
+void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
+                                                       const uint32_t gpu_index,
+                                                       int8_t **buffer);
 }

 #endif // CUDA_MULTI_BIT_H
--- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h
@@ -27,7 +27,7 @@ template <typename Torus> struct zk_expand_mem {
                int_radix_params casting_params, KS_TYPE casting_key_type,
                const uint32_t *num_lwes_per_compact_list,
                const bool *is_boolean_array, uint32_t num_compact_lists,
-                bool allocate_gpu_memory, uint64_t *size_tracker)
+                bool allocate_gpu_memory, uint64_t &size_tracker)
      : computing_params(computing_params), casting_params(casting_params),
        num_compact_lists(num_compact_lists),
        casting_key_type(casting_key_type) {
@@ -112,15 +112,15 @@ template <typename Torus> struct zk_expand_mem {

    // Hint for future readers: if message_modulus == 4 then
    // packed_messages_per_lwe becomes 2
-    auto packed_messages_per_lwe = log2_int(params.message_modulus);
+    auto num_packed_msgs = log2_int(params.message_modulus);

    // Adjust indexes to permute the output and access the correct LUT
    auto h_indexes_in = static_cast<Torus *>(
-        malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
+        malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
    auto h_indexes_out = static_cast<Torus *>(
-        malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
+        malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
    auto h_lut_indexes = static_cast<Torus *>(
-        malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
+        malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
    auto h_body_id_per_compact_list =
        static_cast<uint32_t *>(malloc(num_lwes * sizeof(uint32_t)));
    auto h_lwe_compact_input_indexes =
@@ -138,6 +138,10 @@ template <typename Torus> struct zk_expand_mem {
    auto compact_list_id = 0;
    auto idx = 0;
    auto count = 0;
+    // During flattening, all num_lwes LWEs from all compact lists are stored
+    // sequentially on a Torus array. h_lwe_compact_input_indexes stores the
+    // index of the first LWE related to the compact list that contains the i-th
+    // LWE
    for (int i = 0; i < num_lwes; i++) {
      h_lwe_compact_input_indexes[i] = idx;
      count++;
@@ -148,6 +152,8 @@ template <typename Torus> struct zk_expand_mem {
      }
    }

+    // Stores the index of the i-th LWE (within each compact list) related to
+    // the k-th compact list.
    auto offset = 0;
    for (int k = 0; k < num_compact_lists; k++) {
      auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
@@ -159,46 +165,75 @@ template <typename Torus> struct zk_expand_mem {
      offset += num_lwes_in_kth_compact_list;
    }

+    /*
+     * Each LWE contains encrypted data in both carry and message spaces
+     * that needs to be extracted.
+     *
+     * The loop processes each compact list (k) and for each LWE within that
+     * list:
+     * 1. Sets input indexes to read each LWE twice (for carry and message
+     * extraction)
+     * 2. Creates output indexes to properly reorder the results
+     * 3. Selects appropriate LUT index based on whether boolean sanitization is
+     * needed
+     *
+     * We want the output to always hold the content of the message part
+     * first, followed by the content of the carry part, for each LWE.
+     *
+     * i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
+     * carry_extract(LWE_1), ...
+     *
+     * To achieve that behavior, with 4 LWEs we would have:
+     *
+     * // Each LWE is processed twice
+     * h_indexes_in   = {0, 1, 2, 3, 0, 1, 2, 3}
+     *
+     * // First 4 use message LUT, last 4 use carry LUT
+     * h_lut_indexes  = {0, 0, 0, 0, 1, 1, 1, 1}
+     *
+     * // Reorders output so message and carry for each LWE appear together
+     * h_indexes_out  = {0, 2, 4, 6, 1, 3, 5, 7}
+     *
+     * If an LWE contains a boolean value, its LUT index is shifted by
+     * num_packed_msgs to use the sanitization LUT (which ensures output is
+     * exactly 0 or 1).
+     */
    offset = 0;
    for (int k = 0; k < num_compact_lists; k++) {
-      auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
-      for (int i = 0;
-           i < packed_messages_per_lwe * num_lwes_in_kth_compact_list; i++) {
-        Torus j = i % num_lwes_in_kth_compact_list;
-        h_indexes_in[i + packed_messages_per_lwe * offset] = j + offset;
-        h_indexes_out[i + packed_messages_per_lwe * offset] =
-            packed_messages_per_lwe * (j + offset) +
-            (i / num_lwes_in_kth_compact_list);
+      auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+      for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
+        auto lwe_index = i + num_packed_msgs * offset;
+        auto lwe_index_in_list = i % num_lwes_in_kth;
+        h_indexes_in[lwe_index] = lwe_index_in_list + offset;
+        h_indexes_out[lwe_index] =
+            num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
        // If the input relates to a boolean, shift the LUT so the correct one
        // with sanitization is used
-        h_lut_indexes[i + packed_messages_per_lwe * offset] =
-            (is_boolean_array[h_indexes_out[i +
-                                            packed_messages_per_lwe * offset]]
-                 ? packed_messages_per_lwe
-                 : 0) +
-            i / num_lwes_in_kth_compact_list;
+        auto boolean_offset =
+            is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
+        h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
      }
-      offset += num_lwes_in_kth_compact_list;
+      offset += num_lwes_in_kth;
    }

    message_and_carry_extract_luts->set_lwe_indexes(
        streams[0], gpu_indexes[0], h_indexes_in, h_indexes_out);
    auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
-    message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);

    cuda_memcpy_with_size_tracking_async_to_gpu(
        d_lwe_compact_input_indexes, h_lwe_compact_input_indexes,
        num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
        allocate_gpu_memory);
    cuda_memcpy_with_size_tracking_async_to_gpu(
-        lut_indexes, h_lut_indexes,
-        packed_messages_per_lwe * num_lwes * sizeof(Torus), streams[0],
-        gpu_indexes[0], allocate_gpu_memory);
+        lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
+        streams[0], gpu_indexes[0], allocate_gpu_memory);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        d_body_id_per_compact_list, h_body_id_per_compact_list,
        num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
        allocate_gpu_memory);

+    message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);
+
    // The expanded LWEs will always be on the casting key format
    tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
        num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
--- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt
@@ -1,5 +1,6 @@
 file(GLOB_RECURSE SOURCES "*.cu")
-add_library(tfhe_cuda_backend STATIC ${SOURCES})
+add_library(tfhe_cuda_backend STATIC ${SOURCES} pbs/programmable_bootstrap_multibit_128.cuh
+                                     pbs/programmable_bootstrap_multibit_128.cu)
 set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
 target_include_directories(tfhe_cuda_backend PRIVATE .)
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cu
@@ -84,15 +84,19 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
      static_cast<uint64_t *>(lwe_array_out), size, log_modulus);
 }

+// This endpoint is used only for testing purposes;
+// its output always follows trivial ordering.
 void cuda_improve_noise_modulus_switch_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
-    uint32_t num_lwes, uint32_t num_zeros, double input_variance,
-    double r_sigma, double bound, uint32_t log_modulus) {
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus) {
  host_improve_noise_modulus_switch<uint64_t>(
      static_cast<cudaStream_t>(stream), gpu_index,
      static_cast<uint64_t *>(lwe_array_out),
      static_cast<uint64_t const *>(lwe_array_in),
+      static_cast<uint64_t const *>(lwe_array_indexes),
      static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
      num_zeros, input_variance, r_sigma, bound, log_modulus);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -38,6 +38,16 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
 // Each thread in x are used to calculate one output.
 // threads in y are used to paralelize the lwe_dimension_in loop.
 // shared memory is used to store intermediate results of the reduction.
+// Note: To reduce register pressure we have slightly changed the algorithm:
+// the idea is to compute the negation of the output. So, instead of
+// accumulating subtractions using -=, we accumulate additions using += in
+// local_lwe_out. This seems to work better because it benefits from madd ops
+// and saves some registers. For this to work, we need to negate the input
+// lwe_array_in[lwe_dimension_in], and negate the output back at the end to get
+// the correct results. Additionally, we split the calculation of the ksk offset
+// in two parts: a constant part is calculated before the loop, and a variable
+// part is calculated inside the loop. This seems to help with the register
+// pressure as well.
 template <typename Torus>
 __global__ void
 keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
@@ -60,7 +70,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
        lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);

    if (tid == lwe_dimension_out && threadIdx.y == 0) {
-      local_lwe_out = block_lwe_array_in[lwe_dimension_in];
+      local_lwe_out = -block_lwe_array_in[lwe_dimension_in];
    }
    const Torus mask_mod_b = (1ll << base_log) - 1ll;

@@ -73,12 +83,12 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
    for (int i = start_i; i < end_i; i++) {
      Torus state =
          init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
-
+      uint32_t offset = i * level_count * (lwe_dimension_out + 1);
      for (int j = 0; j < level_count; j++) {
-        auto ksk_block =
-            get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
+
        Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
-        local_lwe_out -= (Torus)ksk_block[tid] * decomposed;
+        local_lwe_out +=
+            (Torus)ksk[tid + j * (lwe_dimension_out + 1) + offset] * decomposed;
      }
    }

@@ -93,7 +103,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
          lwe_acc_out[shmem_index + offset * blockDim.x];
    }
    if (threadIdx.y == 0)
-      block_lwe_array_out[tid] = lwe_acc_out[shmem_index];
+      block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
  }
 }

@@ -172,14 +182,14 @@ __host__ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe(

  // allocate at least LWE-mask times two: to keep both decomposition state and
  // decomposed intermediate value
-  int memory_unit = glwe_accumulator_size > lwe_dimension * 2
-                        ? glwe_accumulator_size
-                        : lwe_dimension * 2;
+  uint64_t memory_unit = glwe_accumulator_size > lwe_dimension * 2
+                             ? glwe_accumulator_size
+                             : lwe_dimension * 2;

-  uint64_t size_tracker;
+  uint64_t size_tracker = 0;
+  uint64_t buffer_size = 2 * num_lwes * memory_unit * sizeof(Torus);
  *fp_ks_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
-      2 * num_lwes * memory_unit * sizeof(Torus), stream, gpu_index,
-      &size_tracker, allocate_gpu_memory);
+      buffer_size, stream, gpu_index, size_tracker, allocate_gpu_memory);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -66,6 +66,13 @@ __device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
  r = __ll2double_rn(x);
 }

+template <>
+__device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
+                                                             double &r) {
+  // We truncate x
+  r = __ll2double_rn(static_cast<uint64_t>(x));
+}
+
 template <typename T>
 __device__ inline T init_decomposer_state(T input, uint32_t base_log,
                                          uint32_t level_count) {
@@ -178,11 +185,12 @@ __device__ __forceinline__ double measure_modulus_switch_noise(

 // Each thread processes two elements of the lwe array
 template <typename Torus>
-__global__ void
-improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
-                             const Torus *zeros, int lwe_size, int num_zeros,
-                             double input_variance, double r_sigma,
-                             double bound, uint32_t log_modulus) {
+__global__ void __launch_bounds__(512)
+    improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
+                                 const uint64_t *indexes, const Torus *zeros,
+                                 int lwe_size, int num_zeros,
+                                 double input_variance, double r_sigma,
+                                 double bound, uint32_t log_modulus) {

  // First we will assume size is less than the number of threads per block
  // I should switch this to dynamic shared memory
@@ -198,13 +206,14 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
  // This probably are not needed cause we are setting the values
  sum_mask_errors[threadIdx.x] = 0.f;
  sum_squared_mask_errors[threadIdx.x] = 0.f;
+  auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
+  // We use modulus switch to gather the output in trivial order
+  auto this_block_lwe_out = array_out + blockIdx.x * lwe_size;
+  Torus input_element1 = this_block_lwe_in[threadIdx.x];

-  Torus input_element1 = array_in[threadIdx.x + blockIdx.x * lwe_size];
-
-  Torus input_element2 =
-      threadIdx.x + blockDim.x < lwe_size
-          ? array_in[threadIdx.x + blockDim.x + blockIdx.x * lwe_size]
-          : 0;
+  Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
+                             ? this_block_lwe_in[threadIdx.x + blockDim.x]
+                             : 0;

  // Base noise is only handled by thread 0
  double base_noise = measure_modulus_switch_noise<Torus>(
@@ -218,11 +227,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
  __syncthreads();

  if (found)
-    array_out[threadIdx.x + blockIdx.x * lwe_size] = input_element1;
+    this_block_lwe_out[threadIdx.x] = input_element1;

  if (found && (threadIdx.x + blockDim.x) < lwe_size)
-    array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
-        input_element2;
+    this_block_lwe_out[threadIdx.x + blockDim.x] = input_element2;

  __syncthreads();
  // If we found a zero element we stop iterating (in avg 20 times are
@@ -253,11 +261,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
    // Assumption we always have at least 512 elements
    // If we find a useful zero encryption we replace the lwe by lwe + zero
    if (found)
-      array_out[threadIdx.x + blockIdx.x * lwe_size] = zero_element1;
+      this_block_lwe_out[threadIdx.x] = zero_element1;

    if (found && (threadIdx.x + blockDim.x) < lwe_size)
-      array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
-          zero_element2;
+      this_block_lwe_out[threadIdx.x + blockDim.x] = zero_element2;

    __syncthreads();
    // If we found a zero element we stop iterating (in avg 20 times are
@@ -270,9 +277,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
 template <typename Torus>
 __host__ void host_improve_noise_modulus_switch(
    cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
-    Torus const *array_in, const Torus *zeros, uint32_t lwe_size,
-    uint32_t num_lwes, const uint32_t num_zeros, const double input_variance,
-    const double r_sigma, const double bound, uint32_t log_modulus) {
+    Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
+    uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
+    const double input_variance, const double r_sigma, const double bound,
+    uint32_t log_modulus) {

  if (lwe_size < 512) {
    PANIC("The lwe_size is less than 512, this is not supported\n");
@@ -289,8 +297,8 @@ __host__ void host_improve_noise_modulus_switch(
  int num_threads = 512, num_blocks = num_lwes;

  improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
-      array_out, array_in, zeros, lwe_size, num_zeros, input_variance, r_sigma,
-      bound, log_modulus);
+      array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
+      r_sigma, bound, log_modulus);
  check_cuda_error(cudaGetLastError());
 }

--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -74,10 +74,9 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 /// asynchronously.
 void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
                                           uint32_t gpu_index,
-                                           uint64_t *size_tracker,
+                                           uint64_t &size_tracker,
                                           bool allocate_gpu_memory) {
-  if (size_tracker != nullptr)
-    *size_tracker += size;
+  size_tracker += size;
  void *ptr = nullptr;
  if (!allocate_gpu_memory)
    return ptr;
@@ -106,8 +105,9 @@ void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
 /// asynchronously.
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                        uint32_t gpu_index) {
-  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index, nullptr,
-                                              true);
+  uint64_t ignored_size = 0; // the tracked size is discarded by this overload
+  return cuda_malloc_with_size_tracking_async(size, stream, gpu_index,
+                                              ignored_size, true);
 }

 /// Check that allocation is valid
@@ -122,6 +122,13 @@ bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
  }
 }

+uint64_t cuda_device_total_memory(uint32_t gpu_index) {
+  cuda_set_device(gpu_index); // select the device to query
+  size_t free_bytes = 0, total_bytes = 0; // free_bytes is queried but unused
+  check_cuda_error(cudaMemGetInfo(&free_bytes, &total_bytes));
+  return total_bytes; // total memory as reported by cudaMemGetInfo
+}
+
 /// Returns
 ///  false if Cooperative Groups is not supported.
 ///  true otherwise
--- a/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
@@ -234,6 +234,29 @@ __device__ void convert_u128_to_f128_as_torus(
  }
 }

+// Register-input variant of convert_u128_to_f128_as_torus(): the inputs are
+// expected to be on registers. params is expected to be the full degree, not
+// the half degree. Every output value is scaled by 2^-128.
+template <class params>
+__device__ void convert_u128_on_regs_to_f128_as_torus(
+    double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
+    const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
+  const double scale = pow(2., -128.);
+  // Distance between the output slots written for consecutive inputs.
+  const auto stride = params::degree / params::opt;
+  Index out_idx = threadIdx.x;
+  // #pragma unroll
+  for (Index k = 0; k < params::opt / 2; ++k, out_idx += stride) {
+    const auto re = u128_to_signed_to_f128(in_re_on_regs[k]);
+    const auto im = u128_to_signed_to_f128(in_im_on_regs[k]);
+    // The hi and lo parts of each value go to separate output arrays.
+    out_re_hi[out_idx] = re.hi * scale;
+    out_re_lo[out_idx] = re.lo * scale;
+    out_im_hi[out_idx] = im.hi * scale;
+    out_im_lo[out_idx] = im.lo * scale;
+  }
+}
+
 template <class params>
 __device__ void
 convert_f128_to_u128_as_torus(__uint128_t *out_re, __uint128_t *out_im,
@@ -272,7 +295,7 @@ batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
 }

 // params is expected to be full degree not half degree
-// converts standqard input into complex<128> represented by 4 double
+// converts standard input into complex<128> represented by 4 double
 // with following pattern: [re_hi_0, re_hi_1, ... re_hi_n, re_lo_0, re_lo_1,
 // ... re_lo_n, im_hi_0, im_hi_1, ..., im_hi_n,  im_lo_0, im_lo_1, ..., im_lo_n]
 template <class params>
@@ -291,7 +314,7 @@ batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
 }

 // params is expected to be full degree not half degree
-// converts standqard input into complex<128> represented by 4 double
+// converts standard input into complex<128> represented by 4 double
 // with following pattern: [re_hi_0, re_lo_0, im_hi_0, im_lo_0, re_hi_1,
 // re_lo_1, im_hi_1, im_lo_1,
 // ...,re_hi_n, re_lo_n, im_hi_n, im_lo_n, ]
@@ -492,6 +515,7 @@ __host__ void host_fourier_transform_forward_as_integer_f128(
  batch_convert_u128_to_f128_as_integer<params>
      <<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
                                             d_standard);
+  check_cuda_error(cudaGetLastError());

  // call negacyclic 128 bit forward fft.
  if (full_sm) {
@@ -503,6 +527,7 @@ __host__ void host_fourier_transform_forward_as_integer_f128(
        <<<grid_size, block_size, shared_memory_size, stream>>>(
            d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
  }
+  check_cuda_error(cudaGetLastError());

  cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
                           gpu_index);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -26,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
  if (is_signed) {
    *mem_ptr = new int_abs_buffer<Torus>(streams, gpu_indexes, gpu_count,
                                         params, num_blocks,
-                                         allocate_gpu_memory, &size_tracker);
+                                         allocate_gpu_memory, size_tracker);
  }
  return size_tracker;
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -63,7 +63,7 @@ void update_degrees_after_bitor(uint64_t *output_degrees,
    auto result = max;

    for (uint j = 0; j < min + 1; j++) {
-      if (max | j > result) {
+      if ((max | j) > result) {
        result = max | j;
      }
    }
@@ -82,7 +82,7 @@ void update_degrees_after_bitxor(uint64_t *output_degrees,

    // Try every possibility to find the worst case
    for (uint j = 0; j < min + 1; j++) {
-      if (max ^ j > result) {
+      if ((max ^ j) > result) {
        result = max ^ j;
      }
    }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -36,7 +36,7 @@ __host__ void host_integer_radix_bitop_kb(
    update_degrees_after_bitor(degrees, lwe_array_1->degrees,
                               lwe_array_2->degrees,
                               lwe_array_1->num_radix_blocks);
-  } else if (mem_ptr->op == BITXOR) {
+  } else if (mem_ptr->op == BITOP_TYPE::BITXOR) {
    update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
                                lwe_array_2->degrees,
                                lwe_array_1->num_radix_blocks);
@@ -61,7 +61,7 @@ __host__ uint64_t scratch_cuda_integer_radix_bitop_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_bitop_buffer<Torus>(streams, gpu_indexes, gpu_count, op,
                                         params, num_radix_blocks,
-                                         allocate_gpu_memory, &size_tracker);
+                                         allocate_gpu_memory, size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cu
@@ -0,0 +1,78 @@
+#include "cast.cuh"
+
+/// 64-bit torus wrapper around
+/// host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>; `streams` is
+/// reinterpreted as an array of cudaStream_t.
+void extend_radix_with_trivial_zero_blocks_msb_64(
+    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
+    void *const *streams, uint32_t const *gpu_indexes) {
+  auto cuda_streams = (cudaStream_t *)streams;
+  host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(
+      output, input, cuda_streams, gpu_indexes);
+}
+
+/// 64-bit torus wrapper around host_trim_radix_blocks_lsb<uint64_t>.
+void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
+                              CudaRadixCiphertextFFI const *input,
+                              void *const *streams,
+                              uint32_t const *gpu_indexes) {
+  auto cuda_streams = (cudaStream_t *)streams;
+  host_trim_radix_blocks_lsb<uint64_t>(output, input, cuda_streams,
+                                       gpu_indexes);
+}
+
+/// Builds an int_radix_params from the scalar arguments and forwards to
+/// scratch_extend_radix_with_sign_msb<uint64_t>, with `mem_ptr` reinterpreted
+/// as a pointer to the typed buffer pointer. Returns what that call returns.
+uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
+    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
+    uint32_t num_blocks, uint32_t num_additional_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory, bool allocate_ms_array) {
+
+  // Fourth constructor argument is derived here, not supplied by the caller.
+  int_radix_params radix_params(
+      pbs_type, glwe_dimension, polynomial_size,
+      glwe_dimension * polynomial_size, lwe_dimension, ks_level, ks_base_log,
+      pbs_level, pbs_base_log, grouping_factor, message_modulus, carry_modulus,
+      allocate_ms_array);
+
+  auto typed_mem_ptr =
+      (int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr;
+  return scratch_extend_radix_with_sign_msb<uint64_t>(
+      (cudaStream_t *)streams, gpu_indexes, gpu_count, typed_mem_ptr,
+      radix_params, num_blocks, num_additional_blocks, allocate_gpu_memory);
+}
+
+/// 64-bit torus wrapper around host_extend_radix_with_sign_msb<uint64_t>;
+/// `mem_ptr` is reinterpreted as the buffer type used by the scratch function.
+void cuda_extend_radix_with_sign_msb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
+    int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
+
+  auto buffer = (int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr;
+  auto typed_ksks = (uint64_t **)ksks;
+  host_extend_radix_with_sign_msb<uint64_t>(
+      (cudaStream_t *)streams, gpu_indexes, gpu_count, output, input, buffer,
+      num_additional_blocks, bsks, typed_ksks, ms_noise_reduction_key);
+}
+
+/// Calls release() on the buffer stored in *mem_ptr_void.
+/// NOTE(review): the buffer object itself (allocated with `new` by the scratch
+/// function) is not deleted here and *mem_ptr_void is left as is; confirm
+/// that release() or the caller takes care of it.
+void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
+                                                uint32_t const *gpu_indexes,
+                                                uint32_t gpu_count,
+                                                int8_t **mem_ptr_void) {
+
+  auto buffer =
+      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);
+  buffer->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh
@@ -0,0 +1,112 @@
+#ifndef CAST_CUH
+#define CAST_CUH
+
+#include "device.h"
+#include "integer.cuh"
+#include "integer/integer_utilities.h"
+
+/// Copies all blocks of `input` into the leading blocks of `output` (slice
+/// bounds 0 .. input->num_radix_blocks on both sides) using streams[0] and
+/// gpu_indexes[0]. Nothing else in `output` is written by this function.
+template <typename Torus>
+__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
+    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
+    cudaStream_t const *streams, uint32_t const *gpu_indexes) {
+  const auto copied_blocks = input->num_radix_blocks;
+  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
+                                           0, copied_blocks, input, 0,
+                                           copied_blocks);
+}
+
+/// Copies the input slice (input blocks - output blocks) .. input blocks into
+/// output blocks 0 .. output->num_radix_blocks, on streams[0] / gpu_indexes[0].
+/// Panics unless `input` has strictly more blocks than `output`.
+template <typename Torus>
+__host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
+                                         CudaRadixCiphertextFFI const *input,
+                                         cudaStream_t const *streams,
+                                         uint32_t const *gpu_indexes) {
+
+  const auto kept_blocks = output->num_radix_blocks;
+  const auto available_blocks = input->num_radix_blocks;
+
+  if (available_blocks <= kept_blocks) {
+    PANIC("Cuda error: input num blocks should be greater than output num "
+          "blocks");
+  }
+
+  // Index of the first input block that is kept.
+  const uint32_t first_kept_block = available_blocks - kept_blocks;
+
+  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
+                                           0, kept_blocks, input,
+                                           first_kept_block, available_blocks);
+}
+
+/// Allocates an int_extend_radix_with_sign_msb_buffer with `new` and stores it
+/// in *mem_ptr. Returns the local size tracker (initialised to 0) after it has
+/// been passed to the buffer constructor.
+template <typename Torus>
+__host__ uint64_t scratch_extend_radix_with_sign_msb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
+    const int_radix_params params, uint32_t num_radix_blocks,
+    uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
+
+  uint64_t tracked_size = 0;
+  *mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
+      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
+      num_additional_blocks, allocate_gpu_memory, tracked_size);
+  return tracked_size;
+}
+
+/// Writes `input` followed by num_additional_blocks copies of a padding block
+/// into `output`. The padding block is produced by host_apply_univariate_lut_kb
+/// from the last input block and mem_ptr->lut. With num_additional_blocks == 0
+/// this is a plain copy of `input`; otherwise an empty `input` panics.
+template <typename Torus>
+__host__ void host_extend_radix_with_sign_msb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
+    CudaRadixCiphertextFFI const *input,
+    int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
+    uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
+
+  // All copies are issued on the first stream / GPU.
+  cudaStream_t const stream = streams[0];
+  uint32_t const gpu_index = gpu_indexes[0];
+
+  if (num_additional_blocks == 0) {
+    copy_radix_ciphertext_async<Torus>(stream, gpu_index, output, input);
+    return;
+  }
+
+  const uint32_t input_blocks = input->num_radix_blocks;
+  if (input_blocks == 0) {
+    PANIC("Cuda error: input blocks cannot be zero");
+  }
+
+  // The leading output blocks are the input itself.
+  copy_radix_ciphertext_slice_async<Torus>(stream, gpu_index, output, 0,
+                                           input_blocks, input, 0,
+                                           input_blocks);
+
+  // Extract the last input block and derive the padding block from it.
+  copy_radix_ciphertext_slice_async<Torus>(stream, gpu_index,
+                                           mem_ptr->last_block, 0, 1, input,
+                                           input_blocks - 1, input_blocks);
+  host_apply_univariate_lut_kb(
+      streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
+      mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
+
+  // Append the padding block once per additional block.
+  for (uint32_t extra = 0; extra < num_additional_blocks; ++extra) {
+    const uint32_t dst_block = input_blocks + extra;
+    copy_radix_ciphertext_slice_async<Torus>(stream, gpu_index, output,
+                                             dst_block, dst_block + 1,
+                                             mem_ptr->padding_block, 0, 1);
+  }
+}
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -100,7 +100,7 @@ __host__ uint64_t scratch_cuda_integer_radix_cmux_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_cmux_buffer<Torus>(
      streams, gpu_indexes, gpu_count, predicate_lut_f, params,
-      num_radix_blocks, allocate_gpu_memory, &size_tracker);
+      num_radix_blocks, allocate_gpu_memory, size_tracker);
  return size_tracker;
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -456,7 +456,7 @@ __host__ void tree_sign_reduction(
  auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
  while (partial_block_count > 2) {
    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
-                       4);
+                       message_modulus);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -477,16 +477,17 @@ __host__ void tree_sign_reduction(
  auto last_lut = tree_buffer->tree_last_leaf_lut;
  auto block_selector_f = tree_buffer->block_selector_f;
  std::function<Torus(Torus)> f;
-
+  auto num_bits_in_message = log2_int(params.message_modulus);
  if (partial_block_count == 2) {
    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
-                       4);
+                       message_modulus);

-    f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
-      int msb = (x >> 2) & 3;
-      int lsb = x & 3;
+    f = [block_selector_f, sign_handler_f, num_bits_in_message,
+         message_modulus](Torus x) -> Torus {
+      Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
+      Torus lsb = x & (message_modulus - 1);

-      int final_sign = block_selector_f(msb, lsb);
+      Torus final_sign = block_selector_f(msb, lsb);
      return sign_handler_f(final_sign);
    };
  } else {
@@ -683,7 +684,7 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_comparison_buffer<Torus>(
      streams, gpu_indexes, gpu_count, op, params, num_radix_blocks, is_signed,
-      allocate_gpu_memory, &size_tracker);
+      allocate_gpu_memory, size_tracker);
  return size_tracker;
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -370,7 +370,7 @@ __host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_compression<Torus>(
      streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
-      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, &size_tracker);
+      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, size_tracker);
  return size_tracker;
 }

@@ -386,7 +386,7 @@ __host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
  *mem_ptr = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
      num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory,
-      &size_tracker);
+      size_tracker);
  return size_tracker;
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -28,7 +28,7 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
  uint64_t size_tracker = 0;
  *mem_ptr = new int_div_rem_memory<Torus>(streams, gpu_indexes, gpu_count,
                                           params, is_signed, num_blocks,
-                                           allocate_gpu_memory, &size_tracker);
+                                           allocate_gpu_memory, size_tracker);
  return size_tracker;
 }

@@ -386,8 +386,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
                         subtraction_overflowed,
                         at_least_one_upper_block_is_non_zero, 1);

-    int factor = (i) ? 3 : 2;
-    int factor_lut_id = factor - 2;
+    auto message_modulus = radix_params.message_modulus;
+    int factor = (i) ? message_modulus - 1 : message_modulus - 2;
+    int factor_lut_id = (i) ? 1 : 0;
    for (size_t k = 0;
         k < cleaned_merged_interesting_remainder->num_radix_blocks; k++) {
      copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
--- a/Show More
+++ b/Show More