refactor(gpu): refactor and optimize sum_ciphertext in cuda backend

fix(gpu): add indexes to modulus switch noise reduction
fix(gpu): allow to build with hpu feature enabled
2026-01-13 00:28:24 -05:00 · 2025-05-22 16:57:23 +04:00 · 2025-05-22 10:50:51 +02:00 · 2025-05-22 10:21:35 +02:00 · 2025-05-21 18:06:58 +01:00 · 2025-05-21 18:06:58 +01:00
1011 changed files with 89762 additions and 21485 deletions
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@@ -6,6 +6,7 @@ self-hosted-runner:
    - large_windows_16_latest
    - large_ubuntu_16
    - large_ubuntu_16-22.04
+    - v80-desktop
 # Configuration variables in array of strings defined in your repository or
 # organization. `null` means disabling configuration variables check.
 # Empty array means no configuration variable is allowed.
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -8,9 +8,6 @@ inputs:
  gcc-version:
    description: Version of GCC to use
    required: true
-  cmake-version:
-    description: Version of cmake to use
-    default: 3.29.6
  github-instance:
    description: Instance is hosted on GitHub
    default: 'false'
@@ -22,41 +19,58 @@ runs:
    - name: Install dependencies
      shell: bash
      run: |
+        wget https://github.com/Kitware/CMake/releases/download/v"${CMAKE_VERSION}"/cmake-"${CMAKE_VERSION}"-linux-x86_64.sh
+        echo "${CMAKE_SCRIPT_SHA} cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
+        sha256sum -c checksum
+        sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
        sudo apt update
-        curl -fsSL https://apt.kitware.com/keys/kitware-archive-latest.asc | sudo gpg --dearmour -o /etc/apt/trusted.gpg.d/kitware.gpg
-        sudo chmod 644 /etc/apt/trusted.gpg.d/kitware.gpg
-        echo 'deb [signed-by=/etc/apt/trusted.gpg.d/kitware.gpg] https://apt.kitware.com/ubuntu/ jammy main' | sudo tee /etc/apt/sources.list.d/kitware.list >/dev/null
-        sudo apt update
-        sudo apt install -y cmake cmake-format libclang-dev
+        sudo apt install -y cmake-format libclang-dev
+      env:
+        CMAKE_VERSION: 3.29.6
+        CMAKE_SCRIPT_SHA: "6e4fada5cba3472ae503a11232b6580786802f0879cead2741672bf65d97488a"

    - name: Install CUDA
      if: inputs.github-instance == 'true'
      shell: bash
      run: |
-        TOOLKIT_VERSION="$(echo ${{ inputs.cuda-version }} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-        sudo dpkg -i cuda-keyring_1.1-1_all.deb
+        TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/"${CUDA_KEYRING_PACKAGE}"
+        echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
+        sha256sum -c checksum
+        sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
        sudo apt update
-        sudo apt -y install cuda-toolkit-${TOOLKIT_VERSION}
+        sudo apt -y install cuda-toolkit-"${TOOLKIT_VERSION}"
+      env:
+        CUDA_VERSION: ${{ inputs.cuda-version }}
+        CUDA_KEYRING_PACKAGE: cuda-keyring_1.1-1_all.deb
+        CUDA_KEYRING_SHA: "d93190d50b98ad4699ff40f4f7af50f16a76dac3bb8da1eaaf366d47898ff8df"

    - name: Export CUDA variables
      shell: bash
      run: |
-        CUDA_PATH=/usr/local/cuda-${{ inputs.cuda-version }}
-        echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-        echo "PATH=$PATH:$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-        echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-        echo "CUDA_MODULE_LOADER=EAGER" >> "${GITHUB_ENV}"
+        CUDA_PATH=/usr/local/cuda-"${CUDA_VERSION}"
+        {
+          echo "CUDA_PATH=$CUDA_PATH";
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH";
+          echo "CUDA_MODULE_LOADER=EAGER";
+        } >> "${GITHUB_ENV}"
+        {
+          echo "$CUDA_PATH/bin";
+        } >> "${GITHUB_PATH}"
+      env:
+        CUDA_VERSION: ${{ inputs.cuda-version }}

    # Specify the correct host compilers
    - name: Export gcc and g++ variables
      shell: bash
      run: |
        {
-          echo "CC=/usr/bin/gcc-${{ inputs.gcc-version }}";
-          echo "CXX=/usr/bin/g++-${{ inputs.gcc-version }}";
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ inputs.gcc-version }}";
+          echo "CC=/usr/bin/gcc-${GCC_VERSION}";
+          echo "CXX=/usr/bin/g++-${GCC_VERSION}";
+          echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
        } >> "${GITHUB_ENV}"
+      env:
+        GCC_VERSION: ${{ inputs.gcc-version }}

    - name: Check device is detected
      shell: bash
--- a/.github/workflows/approve_label.yml
+++ b/.github/workflows/approve_label.yml
@@ -6,6 +6,9 @@ on:
  pull_request_review:
    types: [submitted]

+
+permissions: {}
+
 jobs:
  trigger-tests:
    runs-on: ubuntu-latest
@@ -34,3 +37,10 @@ jobs:
          # We need to use a PAT to be able to trigger `labeled` event for the other workflow.
          github_token: ${{ secrets.FHE_ACTIONS_TOKEN }}
          labels: approved
+
+      - name: Check if maintainer needs to handle label manually
+        if: ${{ failure() }}
+        run: |
+          echo "Pull-request from an external contributor."
+          echo "A maintainer needs to manually add/remove the 'approved' label."
+          exit 1
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -23,6 +23,9 @@ on:
  workflow_dispatch:
  pull_request:

+permissions:
+  contents: read
+
 jobs:
  setup-instance:
    name: Setup instance (backward-compat-tests)
@@ -47,7 +50,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  backward-compat-tests:
    name: Backward compatibility tests
@@ -71,7 +74,7 @@ jobs:
      - name: Use specific data branch
        if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
        env:
-          PR_BRANCH: ${{ github.ref_name }}
+          PR_BRANCH: ${{ github.head_ref || github.ref_name }}
        run: |
          echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"

@@ -83,11 +86,12 @@ jobs:

      - name: Get backward compat branch head SHA
        id: backward_compat_sha
+        run: |
+          SHA=$(git ls-remote "${REPO_URL}" refs/heads/"${BACKWARD_COMPAT_BRANCH}" | awk '{print $1}')
+          echo "sha=${SHA}" >> "${GITHUB_OUTPUT}"
        env:
          REPO_URL: "https://github.com/zama-ai/tfhe-backward-compat-data"
-        run: |
-          SHA=$(git ls-remote ${{ env.REPO_URL }} refs/heads/${{ steps.backward_compat_branch.outputs.branch }} | awk '{print $1}')
-          echo "sha=${SHA}" >> "${GITHUB_OUTPUT}"
+          BACKWARD_COMPAT_BRANCH: ${{ steps.backward_compat_branch.outputs.branch }}

      - name: Retrieve data from cache
        id: retrieve-data-cache
@@ -101,6 +105,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
          repository: zama-ai/tfhe-backward-compat-data
          path: tests/tfhe-backward-compat-data
          lfs: 'true'
@@ -121,12 +126,14 @@ jobs:
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -151,7 +158,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -24,6 +24,9 @@ on:
  workflow_dispatch:
  pull_request:

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -65,7 +68,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            dependencies:
@@ -154,7 +157,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  fast-tests:
    name: Fast CPU tests
@@ -269,12 +272,14 @@ jobs:
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Slack Notification
        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -297,9 +302,9 @@ jobs:
          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (fast-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -30,6 +30,9 @@ on:
    branches:
      - main

+permissions:
+  contents: read
+
 jobs:
  should-run:
    if:
@@ -52,7 +55,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            integer:
@@ -94,7 +97,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  unsigned-integer-tests:
    name: Unsigned integer tests
@@ -134,17 +137,19 @@ jobs:

      - name: Run unsigned integer tests
        run: |
-          AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci
+          AVX512_SUPPORT=ON NO_BIG_PARAMS="${NO_BIG_PARAMS}" BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci

      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -169,7 +174,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -30,6 +30,9 @@ on:
    branches:
      - main

+permissions:
+  contents: read
+
 jobs:
  should-run:
    if:
@@ -53,7 +56,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            integer:
@@ -95,7 +98,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  signed-integer-tests:
    name: Signed integer tests
@@ -139,17 +142,19 @@ jobs:

      - name: Run signed integer tests
        run: |
-          AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci
+          AVX512_SUPPORT=ON NO_BIG_PARAMS="${NO_BIG_PARAMS}" BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci

      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -174,7 +179,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -27,6 +27,9 @@ on:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -74,7 +77,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            dependencies:
@@ -163,7 +166,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cpu-tests:
    name: CPU tests
@@ -251,12 +254,14 @@ jobs:
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -281,7 +286,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -23,6 +23,9 @@ on:
  pull_request:
    types: [ labeled ]

+permissions:
+  contents: read
+
 jobs:
  setup-instance:
    name: Setup instance (wasm-tests)
@@ -48,7 +51,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  wasm-tests:
    name: WASM tests
@@ -120,12 +123,14 @@ jobs:
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -150,7 +155,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -18,6 +18,9 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+
+permissions: {}
+
 jobs:
  setup-instance:
    name: Setup instance (boolean-benchmarks)
@@ -45,7 +48,6 @@ jobs:
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    continue-on-error: true
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -73,15 +75,17 @@ jobs:

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "hpc7a.96xlarge" \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
          --name-suffix avx512
+        env:
+          REF_NAME: ${{ github.ref_name }}

      - name: Measure key sizes
        run: |
@@ -89,7 +93,7 @@ jobs:

      - name: Parse key sizes results
        run: |
-          python3 ./ci/benchmark_parser.py tfhe/boolean_key_sizes.csv ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/boolean_key_sizes.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

@@ -110,13 +114,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Boolean benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -140,7 +144,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (boolean-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -18,6 +18,9 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+
+permissions: {}
+
 jobs:
  setup-instance:
    name: Setup instance (core-crypto-benchmarks)
@@ -75,15 +78,17 @@ jobs:

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "hpc7a.96xlarge" \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --name-suffix avx512 \
          --walk-subdirs
+        env:
+          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -102,13 +107,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "PBS benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -132,7 +137,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_dex.yml
+++ b/.github/workflows/benchmark_dex.yml
@@ -0,0 +1,152 @@
+# Run all DEX benchmarks on an AWS instance and return parsed results to Slab CI bot.
+name: DEX benchmarks
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 5a.m.
+    - cron: '0 5 * * 6'
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}
+
+jobs:
+  setup-instance:
+    name: Setup instance (dex-benchmarks)
+    runs-on: ubuntu-latest
+    if: github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
+    outputs:
+      runner-name: ${{ steps.start-instance.outputs.label }}
+    steps:
+      - name: Start instance
+        id: start-instance
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        with:
+          mode: start
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          backend: aws
+          profile: bench
+
+  dex-benchmarks:
+    name: Execute DEX benchmarks
+    needs: setup-instance
+    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    concurrency:
+      group: ${{ github.workflow_ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    timeout-minutes: 720  # 12 hours
+    steps:
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
+        with:
+          toolchain: nightly
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Run benchmarks
+        run: |
+          make bench_hlapi_dex
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
+          --database tfhe_rs \
+          --hardware "hpc7a.96xlarge" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
+          --walk-subdirs \
+          --name-suffix avx512
+        env:
+          REF_NAME: ${{ github.ref_name }}
+
+      - name: Parse swap request PBS counts
+        run: |
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_pbs_count.csv "${RESULTS_FILENAME}" \
+          --object-sizes \
+          --append-results
+
+      - name: Parse swap claim PBS counts
+        run: |
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_pbs_count.csv "${RESULTS_FILENAME}" \
+          --object-sizes \
+          --append-results
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
+        with:
+          name: ${{ github.sha }}_dex
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"
+
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "DEX benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+
+  teardown-instance:
+    name: Teardown instance (dex-benchmarks)
+    if: ${{ always() && needs.setup-instance.result == 'success' }}
+    needs: [ setup-instance, dex-benchmarks ]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Stop instance
+        id: stop-instance
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        with:
+          mode: stop
+          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
+          slab-url: ${{ secrets.SLAB_BASE_URL }}
+          job-secret: ${{ secrets.JOB_SECRET }}
+          label: ${{ needs.setup-instance.outputs.runner-name }}
+
+      - name: Slack Notification
+        if: ${{ failure() }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "Instance teardown (dex-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -18,6 +18,9 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+
+permissions: {}
+
 jobs:
  setup-instance:
    name: Setup instance (erc20-benchmarks)
@@ -45,7 +48,6 @@ jobs:
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    continue-on-error: true
    timeout-minutes: 720  # 12 hours
    steps:
      - name: Checkout tfhe-rs repo with tags
@@ -82,19 +84,21 @@ jobs:

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "hpc7a.96xlarge" \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
          --name-suffix avx512
+        env:
+          REF_NAME: ${{ github.ref_name }}

      - name: Parse PBS counts
        run: |
-          python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/erc20_pbs_count.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

@@ -107,13 +111,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "ERC20 benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -137,7 +141,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (erc20-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu.yml
+++ b/.github/workflows/benchmark_gpu.yml
@@ -10,13 +10,14 @@ on:
        type: choice
        options:
          - "l40 (n3-L40x1)"
+          - "4-l40 (n3-L40x4)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
          - "4-h100 (n3-H100x4)"
          - "multi-h100 (n3-H100x8)"
          - "multi-h100-nvlink (n3-H100x8-NVLink)"
          - "multi-h100-sxm5 (n3-H100x8-SXM5)"
-          - "multi-a100-nvlink (n3-A100x8-NVLink)"
      command:
        description: "Benchmark command to run"
        type: choice
@@ -28,6 +29,8 @@ on:
          - pbs
          - pbs128
          - ks
+          - ks_pbs
+          - integer_zk
      op_flavor:
        description: "Operations set to run"
        type: choice
@@ -48,6 +51,17 @@ on:
          - latency
          - throughput
          - both
+      params_type:
+        description: "Parameters type"
+        type: choice
+        default: multi_bit
+        options:
+          - classical
+          - multi_bit
+          - both
+
+
+permissions: {}

 jobs:
  parse-inputs:
@@ -55,16 +69,24 @@ jobs:
    outputs:
      profile: ${{ steps.parse_profile.outputs.profile }}
      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
+    env:
+      INPUTS_PROFILE: ${{ inputs.profile }}
    steps:
      - name: Parse profile
        id: parse_profile
        run: |
-          echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"
+          # Keep only the text before the trailing " (...)", e.g. "single-h100 (n3-H100x1)" -> "single-h100"; sed is used because this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
+          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"

      - name: Parse hardware name
        id: parse_hardware_name
        run: |
-          echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"
+          # Keep only the text inside the trailing parentheses, e.g. "single-h100 (n3-H100x1)" -> "n3-H100x1"; sed is used because this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
+          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"

  run-benchmarks:
    name: Run benchmarks
@@ -76,5 +98,14 @@ jobs:
      command: ${{ inputs.command }}
      op_flavor: ${{ inputs.op_flavor }}
      bench_type: ${{ inputs.bench_type }}
+      params_type: ${{ inputs.params_type }}
      all_precisions: ${{ inputs.all_precisions }}
-    secrets: inherit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -22,6 +22,9 @@ on:
    # Weekly benchmarks will be triggered each Friday at 9p.m.
    - cron: "0 21 * * 5"

+permissions:
+  contents: read
+
 jobs:
  cuda-integer-benchmarks:
    name: Cuda integer benchmarks (RTX 4090)
@@ -69,15 +72,17 @@ jobs:

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "rtx4090" \
          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs
+        env:
+          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -88,13 +93,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -145,14 +150,14 @@ jobs:

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "rtx4090" \
          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${GITHUB_REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
      

@@ -165,21 +170,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on results file"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_common.yml
+++ b/.github/workflows/benchmark_gpu_common.yml
@@ -18,10 +18,13 @@ on:
        required: true
      op_flavor: # Use a comma separated values to generate an array
        type: string
-        required: true
+        default: default
      bench_type:
        type: string
        default: latency
+      params_type:
+        type: string
+        default: multi_bit
      all_precisions:
        type: boolean
        default: false
@@ -55,6 +58,9 @@ env:
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  FAST_BENCH: TRUE

+
+permissions: {}
+
 jobs:
  prepare-matrix:
    name: Prepare operations matrix
@@ -63,36 +69,57 @@ jobs:
      command: ${{ steps.set_command.outputs.command }}
      op_flavor: ${{ steps.set_op_flavor.outputs.op_flavor }}
      bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
+      params_type: ${{ steps.set_params_type.outputs.params_type }}
+    env:
+      INPUTS_COMMAND: ${{ inputs.command }}
+      INPUTS_OP_FLAVOR: ${{ inputs.op_flavor }}
    steps:
      - name: Set single command
        if: ${{ !contains(inputs.command, ',')}}
        run: |
-          echo "COMMAND=[\"${{ inputs.command }}\"]" >> "${GITHUB_ENV}"
+          echo "COMMAND=[\"${INPUTS_COMMAND}\"]" >> "${GITHUB_ENV}"

      - name: Set multiple commands
        if: ${{ contains(inputs.command, ',')}}
        run: |
-          PARSED_COMMAND=$(echo "${{ inputs.command }}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
+          # Replace every comma (and the whitespace around it) with \", \" so the list can be wrapped into an array below; sed is used because this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PARSED_COMMAND=$(echo "${INPUTS_COMMAND}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
          echo "COMMAND=[\"${PARSED_COMMAND}\"]" >> "${GITHUB_ENV}"

      - name: Set single operations flavor
        if: ${{ !contains(inputs.op_flavor, ',')}}
        run: |
-          echo "OP_FLAVOR=[\"${{ inputs.op_flavor }}\"]" >> "${GITHUB_ENV}"
+          echo "OP_FLAVOR=[\"${INPUTS_OP_FLAVOR}\"]" >> "${GITHUB_ENV}"

      - name: Set multiple operations flavors
        if: ${{ contains(inputs.op_flavor, ',')}}
        run: |
-          PARSED_OP_FLAVOR=$(echo "${{ inputs.op_flavor }}" | sed 's/[[:space:]]*,[[:space:]]*/", "/g')
+          # Replace every comma (and the whitespace around it) with ", " so the list can be wrapped into an array below; sed is used because this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PARSED_OP_FLAVOR=$(echo "${INPUTS_OP_FLAVOR}" | sed 's/[[:space:]]*,[[:space:]]*/", "/g')
          echo "OP_FLAVOR=[\"${PARSED_OP_FLAVOR}\"]" >> "${GITHUB_ENV}"

      - name: Set benchmark types
        run: |
-          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+          if [[ "${INPUTS_BENCH_TYPE}" == "both" ]]; then
            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
          else
-            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+            echo "BENCH_TYPE=[\"${INPUTS_BENCH_TYPE}\"]" >> "${GITHUB_ENV}"
          fi
+        env:
+          INPUTS_BENCH_TYPE: ${{ inputs.bench_type }}
+
+      - name: Set parameters types
+        run: |
+          if [[ "${INPUTS_PARAMS_TYPE}" == "both" ]]; then
+            echo "PARAMS_TYPE=[\"classical\", \"multi_bit\"]" >> "${GITHUB_ENV}"
+          else
+            echo "PARAMS_TYPE=[\"${INPUTS_PARAMS_TYPE}\"]" >> "${GITHUB_ENV}"
+          fi
+        env:
+          INPUTS_PARAMS_TYPE: ${{ inputs.params_type }}
+

      - name: Set command output
        id: set_command
@@ -109,6 +136,11 @@ jobs:
        run: |
          echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"

+      - name: Set parameters types output
+        id: set_params_type
+        run: |
+          echo "params_type=${{ toJSON(env.PARAMS_TYPE) }}" >> "${GITHUB_OUTPUT}"
+
  setup-instance:
    name: Setup instance (cuda-${{ inputs.profile }}-benchmarks)
    needs: prepare-matrix
@@ -133,10 +165,21 @@ jobs:
          backend: ${{ inputs.backend }}
          profile: ${{ inputs.profile }}

+      - name: Acknowledge remote instance failure  # only 'single-h100' may fall back to a permanent instance
+        if: steps.start-remote-instance.outcome == 'failure' &&
+          inputs.profile != 'single-h100'
+        run: |
+          echo "Remote instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
+          echo "Permanent instance cannot be used as a substitute (profile needed: 'single-h100')"
+          exit 1
+        env:
+          INPUTS_PROFILE: ${{ inputs.profile }}
+
      # This will allow to fallback on permanent instances running on Hyperstack.
      - name: Use permanent remote instance
        id: use-permanent-instance
-        if: steps.start-remote-instance.outcome == 'failure' &&
+        if: env.SECRETS_AVAILABLE == 'true' &&
+          steps.start-remote-instance.outcome == 'failure' &&
          inputs.profile == 'single-h100'
        run: |
          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
@@ -172,7 +215,6 @@ jobs:
    needs: [ prepare-matrix, setup-instance, install-dependencies ]
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 1440 # 24 hours
-    continue-on-error: true
    strategy:
      fail-fast: false
      max-parallel: 1
@@ -180,6 +222,8 @@ jobs:
        command: ${{ fromJSON(needs.prepare-matrix.outputs.command) }}
        op_flavor: ${{ fromJSON(needs.prepare-matrix.outputs.op_flavor) }}
        bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
+        params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
+        # explicit include-based build matrix, of known valid options
        include:
          - cuda: "12.2"
            gcc: 11
@@ -231,26 +275,35 @@ jobs:

      - name: Run benchmarks
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_${{ matrix.command }}_gpu
+          make BENCH_OP_FLAVOR="${OP_FLAVOR}" BENCH_TYPE="${BENCH_TYPE}" BENCH_PARAM_TYPE="${BENCH_PARAMS_TYPE}" bench_"${BENCH_COMMAND}"_gpu
+        env:
+          OP_FLAVOR: ${{ matrix.op_flavor }}
+          BENCH_TYPE: ${{ matrix.bench_type }}
+          BENCH_PARAMS_TYPE: ${{ matrix.params_type }}
+          BENCH_COMMAND: ${{ matrix.command }}

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
-          --hardware "${{ inputs.hardware_name }}" \
+          --hardware "${INPUTS_HARDWARE_NAME}" \
          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --bench-type ${{ matrix.bench_type }}
+          --bench-type "${BENCH_TYPE}"
+        env:
+          INPUTS_HARDWARE_NAME: ${{ inputs.hardware_name }}
+          REF_NAME: ${{ github.ref_name }}
+          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
@@ -264,7 +317,7 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
@@ -275,7 +328,7 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-benchmarks.result }}
          SLACK_MESSAGE: "Cuda benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -299,7 +352,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_dex.yml
+++ b/.github/workflows/benchmark_gpu_dex.yml
@@ -0,0 +1,64 @@
+# Run CUDA DEX benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
+name: Cuda DEX benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      profile:
+        description: "Instance type"
+        required: true
+        type: choice
+        options:
+          - "l40 (n3-L40x1)"
+          - "4-l40 (n3-L40x4)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
+          - "single-h100 (n3-H100x1)"
+          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
+          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-h100-sxm5 (n3-H100x8-SXM5)"
+
+permissions: {}
+
+jobs:
+  parse-inputs:
+    runs-on: ubuntu-latest
+    outputs:
+      profile: ${{ steps.parse_profile.outputs.profile }}
+      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
+    env:
+      INPUTS_PROFILE: ${{ inputs.profile }}
+    steps:
+      - name: Parse profile
+        id: parse_profile
+        run: |
+          # Keep only the text before the trailing " (...)", e.g. "single-h100 (n3-H100x1)" -> "single-h100"; sed is used because this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
+          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"
+
+      - name: Parse hardware name
+        id: parse_hardware_name
+        run: |
+          # Keep only the text inside the trailing parentheses, e.g. "single-h100 (n3-H100x1)" -> "n3-H100x1"; sed is used because this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
+          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
+
+  run-benchmarks:
+    name: Run benchmarks
+    needs: parse-inputs
+    uses: ./.github/workflows/benchmark_gpu_dex_common.yml
+    with:
+      profile: ${{ needs.parse-inputs.outputs.profile }}
+      hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -1,26 +1,55 @@
-# Run core crypto benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
-name: Cuda - Core crypto benchmarks
+# Run DEX benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
+name: Cuda DEX benchmarks - common

 on:
-  workflow_dispatch:
-  schedule:
-    # Weekly benchmarks will be triggered each Saturday at 1a.m.
-    - cron: '0 1 * * 6'
+  workflow_call:
+    inputs:
+      backend:
+        type: string
+        default: hyperstack
+      profile:
+        type: string
+        required: true
+      hardware_name:
+        type: string
+        required: true
+    secrets:
+      REPO_CHECKOUT_TOKEN:
+        required: true
+      SLAB_ACTION_TOKEN:
+        required: true
+      SLAB_BASE_URL:
+        required: true
+      SLAB_URL:
+        required: true
+      JOB_SECRET:
+        required: true
+      SLACK_CHANNEL:
+        required: true
+      BOT_USERNAME:
+        required: true
+      SLACK_WEBHOOK:
+        required: true

 env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+permissions: {}
+
 jobs:
  setup-instance:
-    name: Setup instance (cuda-core-crypto-benchmarks)
+    name: Setup instance (cuda-dex-benchmarks)
    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule' ||
+    if:  github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      # Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
@@ -39,18 +68,30 @@ jobs:
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
-          backend: hyperstack
-          profile: single-h100
+          backend: ${{ inputs.backend }}
+          profile: ${{ inputs.profile }}
+
+      - name: Acknowledge remote instance failure  # only 'single-h100' may fall back to a permanent instance
+        if: steps.start-remote-instance.outcome == 'failure' &&
+          inputs.profile != 'single-h100'
+        run: |
+          echo "Remote instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
+          echo "Permanent instance cannot be used as a substitute (profile needed: 'single-h100')"
+          exit 1
+        env:
+          INPUTS_PROFILE: ${{ inputs.profile }}

      # This will allow to fallback on permanent instances running on Hyperstack.
      - name: Use permanent remote instance
        id: use-permanent-instance
-        if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
+        if: env.SECRETS_AVAILABLE == 'true' &&
+          steps.start-remote-instance.outcome == 'failure' &&
+          inputs.profile == 'single-h100'
        run: |
          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"

-  cuda-core-crypto-benchmarks:
-    name: Execute GPU core crypto benchmarks
+  cuda-dex-benchmarks:
+    name: Cuda DEX benchmarks (${{ inputs.profile }})
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -89,30 +130,30 @@ jobs:
        with:
          toolchain: nightly

-      - name: Run benchmarks with AVX512
+      - name: Run benchmarks
        run: |
-          make bench_ks_pbs_gpu
-          make bench_pbs_gpu
-          make bench_pbs128_gpu
-          make bench_ks_gpu
+          make bench_hlapi_dex_gpu

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
-          --hardware "n3-H100x1" \
+          --hardware "${INPUTS_HARDWARE_NAME}" \
          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
-          --name-suffix avx512 \
-          --walk-subdirs
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
+          --walk-subdirs \
+          --name-suffix avx512
+        env:
+          INPUTS_HARDWARE_NAME: ${{ inputs.hardware_name }}
+          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
        with:
-          name: ${{ github.sha }}_core_crypto
+          name: ${{ github.sha }}_dex_${{ inputs.profile }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
@@ -126,26 +167,26 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
-    needs: [ setup-instance, cuda-core-crypto-benchmarks ]
+    needs: [ setup-instance, cuda-dex-benchmarks ]
    runs-on: ubuntu-latest
-    if: ${{ !success() && !cancelled() }}
+    if: ${{ always() && needs.cuda-dex-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
-          SLACK_COLOR: ${{ needs.cuda-core-crypto-benchmarks.result }}
-          SLACK_MESSAGE: "PBS GPU benchmarks finished with status: ${{ needs.cuda-core-crypto-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_COLOR: ${{ needs.cuda-dex-benchmarks.result }}
+          SLACK_MESSAGE: "Cuda DEX benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-dex-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
-    name: Teardown instance (cuda-integer-full-benchmarks)
+    name: Teardown instance (cuda-dex-${{ inputs.profile }}-benchmarks)
    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
-    needs: [ setup-instance, cuda-core-crypto-benchmarks, slack-notify ]
+    needs: [ setup-instance, cuda-dex-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
@@ -161,7 +202,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-core-crypto-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-dex-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_dex_weekly.yml
+++ b/.github/workflows/benchmark_gpu_dex_weekly.yml
@@ -0,0 +1,61 @@
+# Run CUDA DEX benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
+name: Cuda DEX weekly benchmarks
+
+on:
+  schedule:
+    # Weekly benchmarks will be triggered each Saturday at 9a.m.
+    - cron: '0 9 * * 6'
+
+permissions: {}
+
+jobs:
+  run-benchmarks-1-h100:
+    name: Run benchmarks (1xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_dex_common.yml
+    with:
+      profile: single-h100
+      hardware_name: n3-H100x1
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-2-h100:
+    name: Run benchmarks (2xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_dex_common.yml
+    with:
+      profile: 2-h100
+      hardware_name: n3-H100x2
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-8-h100:
+    name: Run benchmarks (8xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_dex_common.yml
+    with:
+      profile: multi-h100
+      hardware_name: n3-H100x8
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
--- a/.github/workflows/benchmark_gpu_erc20.yml
+++ b/.github/workflows/benchmark_gpu_erc20.yml
@@ -10,6 +10,8 @@ on:
        type: choice
        options:
          - "l40 (n3-L40x1)"
+          - "4-l40 (n3-L40x4)"
+          - "multi-a100-nvlink (n3-A100x8-NVLink)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
          - "4-h100 (n3-H100x4)"
@@ -17,22 +19,33 @@ on:
          - "multi-h100-nvlink (n3-H100x8-NVLink)"
          - "multi-h100-sxm5 (n3-H100x8-SXM5)"

+
+permissions: {}
+
 jobs:
  parse-inputs:
    runs-on: ubuntu-latest
    outputs:
      profile: ${{ steps.parse_profile.outputs.profile }}
      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
+    env:
+      INPUTS_PROFILE: ${{ inputs.profile }}
    steps:
      - name: Parse profile
        id: parse_profile
        run: |
-          echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"
+          # Use sed to extract a value from a string; this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
+          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"

      - name: Parse hardware name
        id: parse_hardware_name
        run: |
-          echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"
+          # Use sed to extract a value from a string; this cannot be done with the ${variable//search/replace} pattern.
+          # shellcheck disable=SC2001
+          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
+          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"

  run-benchmarks:
    name: Run benchmarks
@@ -41,4 +54,12 @@ jobs:
    with:
      profile: ${{ needs.parse-inputs.outputs.profile }}
      hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
-    secrets: inherit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
--- a/.github/workflows/benchmark_gpu_erc20_common.yml
+++ b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -43,6 +43,9 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+
+permissions: {}
+
 jobs:
  setup-instance:
    name: Setup instance (cuda-erc20-benchmarks)
@@ -69,6 +72,16 @@ jobs:
          backend: ${{ inputs.backend }}
          profile: ${{ inputs.profile }}

+      - name: Acknowledge remote instance failure
+        if: steps.start-remote-instance.outcome == 'failure' &&
+          inputs.profile != 'single-h100'
+        run: |
+          echo "Remote instance has failed to start (profile provided: '${INPUTS_PROFILE}')"
+          echo "Permanent instance cannot be used as a substitute (profile needed: 'single-h100')"
+          exit 1  # fail the job: the permanent fallback instance only serves 'single-h100'
+        env:
+          INPUTS_PROFILE: ${{ inputs.profile }}
+
      # This will allow to fallback on permanent instances running on Hyperstack.
      - name: Use permanent remote instance
        id: use-permanent-instance
@@ -124,16 +137,19 @@ jobs:

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
-          --hardware "${{ inputs.hardware_name }}" \
+          --hardware "${INPUTS_HARDWARE_NAME}" \
          --backend gpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
          --name-suffix avx512
+        env:
+          INPUTS_HARDWARE_NAME: ${{ inputs.hardware_name }}
+          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -152,7 +168,7 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
@@ -163,7 +179,7 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-erc20-benchmarks.result }}
          SLACK_MESSAGE: "Cuda ERC20 benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-erc20-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -187,7 +203,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-erc20-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_gpu_erc20_weekly.yml
+++ b/.github/workflows/benchmark_gpu_erc20_weekly.yml
@@ -6,6 +6,9 @@ on:
    # Weekly benchmarks will be triggered each Saturday at 5a.m.
    - cron: '0 5 * * 6'

+
+permissions: {}
+
 jobs:
  run-benchmarks-1-h100:
    name: Run benchmarks (1xH100)
@@ -14,7 +17,15 @@ jobs:
    with:
      profile: single-h100
      hardware_name: n3-H100x1
-    secrets: inherit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

  run-benchmarks-2-h100:
    name: Run benchmarks (2xH100)
@@ -23,7 +34,15 @@ jobs:
    with:
      profile: 2-h100
      hardware_name: n3-H100x2
-    secrets: inherit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

  run-benchmarks-8-h100:
    name: Run benchmarks (8xH100)
@@ -32,4 +51,12 @@ jobs:
    with:
      profile: multi-h100
      hardware_name: n3-H100x8
-    secrets: inherit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
--- a/.github/workflows/benchmark_gpu_weekly.yml
+++ b/.github/workflows/benchmark_gpu_weekly.yml
@@ -6,9 +6,12 @@ on:
    # Weekly benchmarks will be triggered each Saturday at 1a.m.
    - cron: '0 1 * * 6'

+
+permissions: {}
+
 jobs:
  run-benchmarks-1-h100:
-    name: Run benchmarks (1xH100)
+    name: Run integer benchmarks (1xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -18,10 +21,18 @@ jobs:
      op_flavor: default
      bench_type: latency
      all_precisions: true
-    secrets: inherit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

  run-benchmarks-2-h100:
-    name: Run benchmarks (2xH100)
+    name: Run integer benchmarks (2xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -31,10 +42,18 @@ jobs:
      op_flavor: default
      bench_type: latency
      all_precisions: true
-    secrets: inherit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

  run-benchmarks-8-h100:
-    name: Run benchmarks (8xH100)
+    name: Run integer benchmarks (8xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -44,10 +63,18 @@ jobs:
      op_flavor: default
      bench_type: latency
      all_precisions: true
-    secrets: inherit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

  run-benchmarks-l40:
-    name: Run benchmarks (L40)
+    name: Run integer benchmarks (L40)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_common.yml
    with:
@@ -57,4 +84,31 @@ jobs:
      op_flavor: default
      bench_type: latency
      all_precisions: true
-    secrets: inherit
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
+
+  run-benchmarks-1-h100-core-crypto:
+    name: Run core-crypto benchmarks (1xH100)
+    if: github.repository == 'zama-ai/tfhe-rs'
+    uses: ./.github/workflows/benchmark_gpu_common.yml
+    with:
+      profile: single-h100
+      hardware_name: n3-H100x1
+      command: pbs,pbs128,ks,ks_pbs
+      bench_type: latency
+    secrets:
+      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
+      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      JOB_SECRET: ${{ secrets.JOB_SECRET }}
+      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
+      SLAB_URL: ${{ secrets.SLAB_URL }}
+      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
--- a/.github/workflows/benchmark_hpu_integer.yml
+++ b/.github/workflows/benchmark_hpu_integer.yml
@@ -0,0 +1,88 @@
+# Run all integer and ERC20 benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
+name: Hpu Integer Benchmarks
+
+on:
+  workflow_dispatch:
+
+env:
+  CARGO_TERM_COLOR: always
+  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  RUST_BACKTRACE: "full"
+  RUST_MIN_STACK: "8388608"
+
+permissions: {}
+
+jobs:
+  integer-benchmarks-hpu:
+    name: Execute integer & erc20 benchmarks for HPU backend
+    runs-on: v80-desktop
+    concurrency:
+      group: ${{ github.workflow }}_${{ github.ref }}
+      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+    timeout-minutes: 1440  # 24 hours
+    steps:
+      # Needed as long as hw_regmap repository is private
+      - name: Configure SSH
+        uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
+        with:
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+
+      - name: Checkout tfhe-rs repo with tags
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0  # full history and tags, needed by `git describe --tags` in the next step
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Get benchmark details
+        run: |
+          {
+            echo "BENCH_DATE=$(date --iso-8601=seconds)";
+            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
+            echo "COMMIT_HASH=$(git describe --tags --dirty)";
+          } >> "${GITHUB_ENV}"
+
+      - name: Install rust
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        with:
+          toolchain: nightly
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Run benchmarks
+        run: |
+          make bench_integer_hpu
+          make bench_hlapi_erc20_hpu
+
+      - name: Parse results
+        run: |
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
+          --database tfhe_rs \
+          --hardware "hpu_x1" \
+          --backend hpu \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
+          --walk-subdirs
+        env:
+          REF_NAME: ${{ github.ref_name }}
+
+      - name: Upload parsed results artifact
+        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
+        with:
+          name: ${{ github.sha }}_integer_benchmarks
+          path: ${{ env.RESULTS_FILENAME }}
+
+      - name: Send data to Slab
+        shell: bash
+        run: |
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"
--- a/.github/workflows/benchmark_integer.yml
+++ b/.github/workflows/benchmark_integer.yml
@@ -36,6 +36,9 @@ env:
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  FAST_BENCH: TRUE

+
+permissions: {}
+
 jobs:
  prepare-matrix:
    name: Prepare operations matrix
@@ -60,11 +63,13 @@ jobs:
        if: github.event_name == 'workflow_dispatch'
        run: |
          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
-          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+          if [[ "${INPUTS_BENCH_TYPE}" == "both" ]]; then
            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
          else
-            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+            echo "BENCH_TYPE=[\"${INPUTS_BENCH_TYPE}\"]" >> "${GITHUB_ENV}"
          fi
+        env:
+          INPUTS_BENCH_TYPE: ${{ inputs.bench_type }}

      - name: Default benchmark type
        if: github.event_name != 'workflow_dispatch'
@@ -106,7 +111,6 @@ jobs:
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    continue-on-error: true
    timeout-minutes: 1440  # 24 hours
    strategy:
      max-parallel: 1
@@ -150,26 +154,35 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_${{ matrix.command }}
+          make BENCH_OP_FLAVOR="${OP_FLAVOR}" BENCH_TYPE="${BENCH_TYPE}" bench_"${BENCH_COMMAND}"
+        env:
+          OP_FLAVOR: ${{ matrix.op_flavor }}
+          BENCH_TYPE: ${{ matrix.bench_type }}
+          BENCH_COMMAND: ${{ matrix.command }}

      # Run these benchmarks only once per benchmark type
      - name: Run compression benchmarks with AVX512
        if: matrix.op_flavor == 'default' && matrix.command == 'integer'
        run: |
-          make BENCH_TYPE=${{ matrix.bench_type }} bench_integer_compression
+          make BENCH_TYPE="${BENCH_TYPE}" bench_integer_compression
+        env:
+          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "hpc7a.96xlarge" \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --bench-type ${{ matrix.bench_type }}
+          --bench-type "${BENCH_TYPE}"
+        env:
+          REF_NAME: ${{ github.ref_name }}
+          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -180,13 +193,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -210,7 +223,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -22,6 +22,9 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+
+permissions: {}
+
 jobs:
  prepare-matrix:
    name: Prepare operations matrix
@@ -72,7 +75,6 @@ jobs:
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    continue-on-error: true
    strategy:
      max-parallel: 1
      matrix:
@@ -108,21 +110,23 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
+          make BENCH_OP_FLAVOR="${OP_FLAVOR}" bench_shortint
+        env:
+          OP_FLAVOR: ${{ matrix.op_flavor }}

      - name: Parse results
        run: |
-          COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
-          COMMIT_HASH="$(git describe --tags --dirty)"
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "hpc7a.96xlarge" \
          --project-version "${COMMIT_HASH}" \
-          --branch ${{ github.ref_name }} \
+          --branch "${REF_NAME}" \
          --commit-date "${COMMIT_DATE}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
          --name-suffix avx512
+        env:
+          REF_NAME: ${{ github.ref_name }}

      # This small benchmark needs to be executed only once.
      - name: Measure key sizes
@@ -133,7 +137,7 @@ jobs:
      - name: Parse key sizes results
        if: matrix.op_flavor == 'default'
        run: |
-          python3 ./ci/benchmark_parser.py tfhe/shortint_key_sizes.csv ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/shortint_key_sizes.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

@@ -146,13 +150,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Shortint full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -176,7 +180,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (shortint-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_signed_integer.yml
+++ b/.github/workflows/benchmark_signed_integer.yml
@@ -36,6 +36,9 @@ env:
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  FAST_BENCH: TRUE

+
+permissions: {}
+
 jobs:
  prepare-matrix:
    name: Prepare operations matrix
@@ -60,11 +63,13 @@ jobs:
        if: github.event_name == 'workflow_dispatch'
        run: |
          echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
-          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+          if [[ "${INPUTS_BENCH_TYPE}" == "both" ]]; then
            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
          else
-            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+            echo "BENCH_TYPE=[\"${INPUTS_BENCH_TYPE}\"]" >> "${GITHUB_ENV}"
          fi
+        env:
+          INPUTS_BENCH_TYPE: ${{ inputs.bench_type }}

      - name: Default benchmark type
        if: github.event_name != 'workflow_dispatch'
@@ -106,7 +111,6 @@ jobs:
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-    continue-on-error: true
    timeout-minutes: 1440  # 24 hours
    strategy:
      max-parallel: 1
@@ -150,20 +154,27 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} BENCH_TYPE=${{ matrix.bench_type }} bench_signed_${{ matrix.command }}
+          make BENCH_OP_FLAVOR="${OP_FLAVOR}" BENCH_TYPE="${BENCH_TYPE}" bench_signed_"${BENCH_COMMAND}"
+        env:
+          OP_FLAVOR: ${{ matrix.op_flavor }}
+          BENCH_TYPE: ${{ matrix.bench_type }}
+          BENCH_COMMAND: ${{ matrix.command }}

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "hpc7a.96xlarge" \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --bench-type ${{ matrix.bench_type }}
+          --bench-type "${BENCH_TYPE}"
+        env:
+          REF_NAME: ${{ github.ref_name }}
+          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -174,13 +185,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Signed integer full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -204,7 +215,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (signed-integer-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -23,6 +23,9 @@ on:
    # Job will be triggered each Thursday at 11p.m.
    - cron: '0 23 * * 4'

+
+permissions: {}
+
 jobs:
  setup-ec2:
    name: Setup EC2 instance (fft-benchmarks)
@@ -53,6 +56,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -74,14 +79,16 @@ jobs:

      - name: Parse AVX512 results
        run: |
-          python3 ./ci/fft_benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/fft_benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database concrete_fft \
          --hardware "hpc7a.96xlarge" \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --name-suffix avx512
+        env:
+          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -100,21 +107,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-fft benchmarks failed. (${{ env.ACTION_RUN_URL }})"
@@ -138,7 +137,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "EC2 teardown (fft-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -23,6 +23,9 @@ on:
    # Job will be triggered each Friday at 11p.m.
    - cron: "0 23 * * 5"

+
+permissions: {}
+
 jobs:
  setup-ec2:
    name: Setup EC2 instance (ntt-benchmarks)
@@ -53,6 +56,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -74,14 +79,16 @@ jobs:

      - name: Parse results
        run: |
-          python3 ./ci/ntt_benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/ntt_benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database concrete_ntt \
          --hardware "hpc7a.96xlarge" \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --name-suffix avx512
+        env:
+          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -100,21 +107,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          echo "Computing HMac on downloaded artifact"
-          SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
-          echo "Sending results to Slab..."
-          curl -v -k \
-          -H "Content-Type: application/json" \
-          -H "X-Slab-Repository: ${{ github.repository }}" \
-          -H "X-Slab-Command: store_data_v2" \
-          -H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-          -d @${{ env.RESULTS_FILENAME }} \
-          ${{ secrets.SLAB_URL }}
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
+          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-ntt benchmarks failed. (${{ env.ACTION_RUN_URL }})"
@@ -138,7 +137,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "EC2 teardown (ntt-benchmarks) failed. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_tfhe_zk_pok.yml
+++ b/.github/workflows/benchmark_tfhe_zk_pok.yml
@@ -30,6 +30,9 @@ env:
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  BENCH_TYPE: ${{ inputs.bench_type || 'latency' }}

+
+permissions: {}
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -42,10 +45,12 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            zk_pok:
@@ -114,22 +119,24 @@ jobs:

      - name: Run benchmarks
        run: |
-          make BENCH_TYPE=${{ env.BENCH_TYPE }} bench_tfhe_zk_pok
+          make BENCH_TYPE="${BENCH_TYPE}" bench_tfhe_zk_pok

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --crate tfhe-zk-pok \
          --hardware "hpc7a.96xlarge" \
          --backend cpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --bench-type ${{ env.BENCH_TYPE }}
+          --bench-type "${BENCH_TYPE}"
+        env:
+          REF_NAME: ${{ github.ref_name }}

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -148,13 +155,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-zk-pok benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -178,7 +185,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (tfhe-zk-pok-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -21,6 +21,9 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+
+permissions: {}
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -41,7 +44,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            wasm_bench:
@@ -143,15 +146,17 @@ jobs:
      - name: Parse results
        run: |
          make parse_wasm_benchmarks
-          python3 ./ci/benchmark_parser.py tfhe/wasm_pk_gen.csv ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "m6i.4xlarge" \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --key-gen
-          rm tfhe/wasm_pk_gen.csv
+          rm tfhe-benchmark/wasm_pk_gen.csv
+        env:
+          REF_NAME: ${{ github.ref_name }}

      # Run these benchmarks only once
      - name: Measure public key and ciphertext sizes in HL Api
@@ -162,7 +167,7 @@ jobs:
      - name: Parse key and ciphertext sizes results
        if:  matrix.browser == 'chrome'
        run: |
-          python3 ./ci/benchmark_parser.py tfhe/hlapi_cpk_and_cctl_sizes.csv ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/hlapi_cpk_and_cctl_sizes.csv "${RESULTS_FILENAME}" \
          --key-gen \
          --append-results

@@ -183,13 +188,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "WASM benchmarks (${{ matrix.browser }}) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -213,7 +218,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (wasm-client-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -31,6 +31,9 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+
+permissions: {}
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -48,7 +51,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            zk_pok:
@@ -74,11 +77,13 @@ jobs:
      - name: Set benchmark types
        if: github.event_name == 'workflow_dispatch'
        run: |
-          if [[ "${{ inputs.bench_type }}" == "both" ]]; then
+          if [[ "${INPUTS_BENCH_TYPE}" == "both" ]]; then
            echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
          else
-            echo "BENCH_TYPE=[\"${{ inputs.bench_type }}\"]" >> "${GITHUB_ENV}"
+            echo "BENCH_TYPE=[\"${INPUTS_BENCH_TYPE}\"]" >> "${GITHUB_ENV}"
          fi
+        env:
+          INPUTS_BENCH_TYPE: ${{ inputs.bench_type }}

      - name: Default benchmark type
        if: github.event_name != 'workflow_dispatch'
@@ -156,25 +161,30 @@ jobs:

      - name: Run benchmarks with AVX512
        run: |
-          make BENCH_TYPE=${{ matrix.bench_type }} bench_integer_zk
+          make BENCH_TYPE="${BENCH_TYPE}" bench_integer_zk
+        env:
+          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Parse results
        run: |
-          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
          --database tfhe_rs \
          --hardware "hpc7a.96xlarge" \
          --backend cpu \
-          --project-version "${{ env.COMMIT_HASH }}" \
-          --branch ${{ github.ref_name }} \
-          --commit-date "${{ env.COMMIT_DATE }}" \
-          --bench-date "${{ env.BENCH_DATE }}" \
+          --project-version "${COMMIT_HASH}" \
+          --branch "${REF_NAME}" \
+          --commit-date "${COMMIT_DATE}" \
+          --bench-date "${BENCH_DATE}" \
          --walk-subdirs \
          --name-suffix avx512 \
-          --bench-type ${{ matrix.bench_type }}
+          --bench-type "${BENCH_TYPE}"
+        env:
+          REF_NAME: ${{ github.ref_name }}
+          BENCH_TYPE: ${{ matrix.bench_type }}

      - name: Parse CRS sizes results
        run: |
-          python3 ./ci/benchmark_parser.py tfhe/pke_zk_crs_sizes.csv ${{ env.RESULTS_FILENAME }} \
+          python3 ./ci/benchmark_parser.py tfhe-benchmark/pke_zk_crs_sizes.csv "${RESULTS_FILENAME}" \
          --object-sizes \
          --append-results

@@ -195,13 +205,13 @@ jobs:
      - name: Send data to Slab
        shell: bash
        run: |
-          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
+          python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "PKE ZK benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -225,7 +235,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (pke-zk-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -8,11 +8,15 @@ env:
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

+permissions:
+  contents: read
+
 jobs:
  cargo-builds:
    runs-on: ${{ matrix.os }}
@@ -26,6 +30,9 @@ jobs:

    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
@@ -87,5 +94,10 @@ jobs:
        run: |
          make build_tfhe_coverage

+      - name: Run Hpu pcc checks
+        if: ${{ contains(matrix.os, 'ubuntu') }}
+        run: |
+          make pcc_hpu
+
      # The wasm build check is a bit annoying to set-up here and is done during the tests in
      # aws_tfhe_tests.yml
--- a/.github/workflows/cargo_build_tfhe_fft.yml
+++ b/.github/workflows/cargo_build_tfhe_fft.yml
@@ -6,11 +6,15 @@ on:

 env:
  CARGO_TERM_COLOR: always
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

+permissions:
+  contents: read
+
 jobs:
  cargo-builds-fft:
    runs-on: ${{ matrix.runner_type }}
@@ -22,6 +26,9 @@ jobs:

    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
--- a/.github/workflows/cargo_build_tfhe_ntt.yml
+++ b/.github/workflows/cargo_build_tfhe_ntt.yml
@@ -6,11 +6,15 @@ on:

 env:
  CARGO_TERM_COLOR: always
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

+permissions:
+  contents: read
+
 jobs:
  cargo-builds-ntt:
    runs-on: ${{ matrix.os }}
@@ -20,6 +24,9 @@ jobs:
      fail-fast: false
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
--- a/.github/workflows/cargo_test_fft.yml
+++ b/.github/workflows/cargo_test_fft.yml
@@ -10,11 +10,15 @@ on:
 env:
  CARGO_TERM_COLOR: always
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -28,10 +32,11 @@ jobs:
        with:
          fetch-depth: 0
          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            fft:
@@ -46,10 +51,13 @@ jobs:
    runs-on: ${{ matrix.runner_type }}
    strategy:
      matrix:
-        runner_type: [ubuntu-latest, macos-latest, windows-latest]
+        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
      fail-fast: false
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -74,9 +82,12 @@ jobs:
    runs-on: ${{ matrix.runner_type }}
    strategy:
      matrix:
-        runner_type: [ubuntu-latest, macos-latest, windows-latest]
+        runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -98,6 +109,9 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Test node js
        run: |
--- a/.github/workflows/cargo_test_ntt.yml
+++ b/.github/workflows/cargo_test_ntt.yml
@@ -10,12 +10,16 @@ on:
 env:
  CARGO_TERM_COLOR: always
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

-jobs:  
+permissions:
+  contents: read
+
+jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
@@ -28,10 +32,11 @@ jobs:
        with:
          fetch-depth: 0
          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            ntt:
@@ -46,10 +51,13 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
      fail-fast: false
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -69,9 +77,12 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        os: [ ubuntu-latest, macos-latest, windows-latest ]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install Rust
        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -3,6 +3,10 @@ name: Check commit and PR compliance
 on:
  pull_request:

+permissions:
+  contents: read
+  pull-requests: read # Permission needed to scan commits in a pull-request
+
 jobs:
  check-commit-pr:
    name: Check commit and PR
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -5,9 +5,13 @@ on:
  pull_request:

 env:
-  ACTIONLINT_VERSION: 1.6.27
+  ACTIONLINT_VERSION: 1.7.7
+  ACTIONLINT_CHECKSUM: "023070a287cd8cccd71515fedc843f1985bf96c436b7effaecce67290e7e0757"
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

+permissions:
+  contents: read
+
 jobs:
  lint-check:
    name: Lint and checks
@@ -21,17 +25,22 @@ jobs:

      - name: Get actionlint
        run: |
-          bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) ${{ env.ACTIONLINT_VERSION }}
-          echo "f2ee6d561ce00fa93aab62a7791c1a0396ec7e8876b2a8f2057475816c550782  actionlint" > checksum
+          wget "https://github.com/rhysd/actionlint/releases/download/v${ACTIONLINT_VERSION}/actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz"
+          echo "${ACTIONLINT_CHECKSUM} actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz" > checksum
          sha256sum -c checksum
+          tar -xf "actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz" actionlint
          ln -s "$(pwd)/actionlint" /usr/local/bin/

      - name: Lint workflows
        run: |
          make lint_workflow

+      - name: Check workflows security
+        run: |
+          make check_workflow_security
+
      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@25ed13d0628a1601b4b44048e63cc4328ed03633 # v3.0.22
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@4830be28ce81da52ec70d65c552a7403821d98d4 # v3.0.23
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -10,12 +10,16 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  # Code coverage workflow is only run via workflow_dispatch event since execution duration is not stabilized yet.

+permissions:
+  contents: read
+
 jobs:
  setup-instance:
    name: Setup instance (code-coverage)
@@ -45,6 +49,9 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
@@ -53,7 +60,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            tfhe:
@@ -83,7 +90,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@0565863a31f2c772f9f0395002a31e3f06189574
+        uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -97,7 +104,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@0565863a31f2c772f9f0395002a31e3f06189574
+        uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -106,9 +113,9 @@ jobs:
          files: integer/cobertura.xml

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Code coverage finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -132,7 +139,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (code-coverage) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -21,6 +21,9 @@ on:
  pull_request:
    types: [ labeled ]

+permissions:
+  contents: read
+
 jobs:
  setup-instance:
    name: Setup instance (csprng-randomness-tests)
@@ -46,7 +49,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  csprng-randomness-tests:
    name: CSPRNG randomness tests
@@ -72,9 +75,9 @@ jobs:
          make dieharder_csprng

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -99,7 +102,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/data_pr_close.yml
+++ b/.github/workflows/data_pr_close.yml
@@ -8,7 +8,7 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  PR_BRANCH: ${{ github.ref_name }}
+  PR_BRANCH: ${{ github.head_ref || github.ref_name }}
  CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}

 # only trigger on pull request closed events
@@ -25,6 +25,9 @@ on:
 # the script will always return 0 because of the "echo EOF".


+
+permissions: {}
+
 jobs:
  auto_close_job:
    if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
@@ -39,14 +42,17 @@ jobs:
          curl --fail-with-body --no-progress-meter -L -X GET \
          -H "Accept: application/vnd.github+json" \
          -H "X-GitHub-Api-Version: 2022-11-28"  \
-          ${{ env.TARGET_REPO_API_URL }}/pulls\?head=${{ github.repository_owner }}:${{ env.PR_BRANCH }} | jq -e '.[0]' | sed 's/null/{ "message": "corresponding PR not found" }/'
+          "${TARGET_REPO_API_URL}"/pulls\?head="${REPO_OWNER}":"${PR_BRANCH}" | jq -e '.[0]' | sed 's/null/{ "message": "corresponding PR not found" }/'
          RES="$?"
          echo EOF
        } >> "${GITHUB_ENV}"
        exit $RES
+      env:
+        REPO_OWNER: ${{ github.repository_owner }}

    - name: Comment on the PR to indicate the reason of the close
      run: |
+        BODY="{ \"body\": \"PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}\" }"  # bare JSON object; wrapping quotes would be sent verbatim to the API
        {
          set +e
          set -o pipefail
@@ -55,12 +61,16 @@ jobs:
          -H "Accept: application/vnd.github+json" \
          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
          -H "X-GitHub-Api-Version: 2022-11-28" \
-          ${{ fromJson(env.TARGET_REPO_PR).comments_url }} \
-          -d '{ "body": "PR ${{ env.CLOSE_TYPE }}d because the corresponding PR in main repo was ${{ env.CLOSE_TYPE }}d: ${{ github.repository }}#${{ github.event.number  }}" }'
+          "${COMMENTS_URL}" \
+          -d "${BODY}"
          RES="$?"
          echo EOF
        } >> "${GITHUB_ENV}"
        exit $RES
+      env:
+        REPO: ${{ github.repository }}
+        EVENT_NUMBER: ${{ github.event.number }}
+        COMMENTS_URL: ${{ fromJson(env.TARGET_REPO_PR).comments_url }}

    - name: Merge the Pull Request in the data repo
      if: ${{ github.event.pull_request.merged }}
@@ -73,12 +83,14 @@ jobs:
          -H "Accept: application/vnd.github+json" \
          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
          -H "X-GitHub-Api-Version: 2022-11-28" \
-          ${{ fromJson(env.TARGET_REPO_PR).url }}/merge \
+          "${TARGET_REPO_PR_URL}"/merge \
          -d '{ "merge_method": "rebase" }'
          RES="$?"
          echo EOF
        } >> "${GITHUB_ENV}"
        exit $RES
+      env:
+        TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}

    - name: Close the Pull Request in the data repo
      if: ${{ !github.event.pull_request.merged }}
@@ -91,12 +103,14 @@ jobs:
          -H "Accept: application/vnd.github+json" \
          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
          -H "X-GitHub-Api-Version: 2022-11-28" \
-          ${{ fromJson(env.TARGET_REPO_PR).url }} \
+          "${TARGET_REPO_PR_URL}" \
          -d '{ "state": "closed" }'
          RES="$?"
          echo EOF
        } >> "${GITHUB_ENV}"
        exit $RES
+      env:
+        TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}

    - name: Delete the associated branch in the data repo
      run: |
@@ -108,7 +122,7 @@ jobs:
          -H "Accept: application/vnd.github+json" \
          -H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
          -H "X-GitHub-Api-Version: 2022-11-28" \
-          ${{ env.TARGET_REPO_API_URL }}/git/refs/heads/${{ env.PR_BRANCH }}
+          "${TARGET_REPO_API_URL}"/git/refs/heads/"${PR_BRANCH}"
          RES="$?"
          echo EOF
        } >> "${GITHUB_ENV}"
@@ -117,7 +131,7 @@ jobs:
    - name: Slack Notification
      if: ${{ always() && job.status == 'failure' }}
      continue-on-error: true
-      uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+      uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
      env:
        SLACK_COLOR: ${{ job.status }}
        SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -22,6 +22,9 @@ on:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"

+permissions:
+  contents: read
+
 jobs:
  cuda-tests-linux:
    name: CUDA tests (RTX 4090)
@@ -77,9 +80,9 @@ jobs:
          github_token: ${{ secrets.GITHUB_TOKEN }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -25,6 +25,9 @@ on:
  pull_request:
    types: [ labeled ]

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -42,7 +45,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            gpu:
@@ -102,7 +105,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
    name: CUDA H100 tests
@@ -169,11 +172,13 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -198,7 +203,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -24,6 +24,9 @@ on:
  workflow_dispatch:
  pull_request:

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -41,7 +44,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            gpu:
@@ -87,7 +90,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
    name: CUDA tests
@@ -153,11 +156,13 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -182,7 +187,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -15,6 +15,9 @@ env:
 on:
  workflow_dispatch:

+
+permissions: {}
+
 jobs:
  setup-instance:
    name: Setup instance (cuda-h100-tests)
@@ -62,18 +65,6 @@ jobs:
            cuda: "12.2"
            gcc: 11 
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
@@ -116,7 +107,7 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -140,7 +131,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -25,6 +25,9 @@ on:
  pull_request:
    types: [ labeled ]

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -42,7 +45,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            gpu:
@@ -89,7 +92,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
    name: CUDA multi-GPU tests
@@ -146,7 +149,7 @@ jobs:

      - name: Run High Level API Tests
        run: |
-          BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
+          BIG_TESTS_INSTANCE=FALSE make test_high_level_api_gpu

  slack-notify:
    name: Slack Notification
@@ -158,11 +161,13 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -187,7 +192,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -10,6 +10,7 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -18,6 +19,9 @@ on:
    # Nightly tests will be triggered each evening 8p.m.
    - cron: "0 20 * * *"

+permissions:
+  contents: read
+
 jobs:
  setup-instance:
    name: Setup instance (gpu-tests)
@@ -57,6 +61,9 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/gpu_setup
@@ -81,7 +88,7 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests.result }}
          SLACK_MESSAGE: "Integer GPU long run tests finished with status: ${{ needs.cuda-tests.result }}. (${{ env.ACTION_RUN_URL }})"
@@ -105,7 +112,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (gpu-long-run-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -1,4 +1,4 @@
-# Perfom tfhe-cuda-backend post-commit checks on an AWS instance
+# Perform tfhe-cuda-backend post-commit checks on an AWS instance
 name: Cuda - Post-commit Checks

 env:
@@ -17,10 +17,15 @@ env:
  # Secrets will be available only to zama-ai organization members
  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16-22.04"
+  CUDA_KEYRING_PACKAGE: cuda-keyring_1.1-1_all.deb
+  CUDA_KEYRING_SHA: "d93190d50b98ad4699ff40f4f7af50f16a76dac3bb8da1eaaf366d47898ff8df"

 on:
  pull_request:

+permissions:
+  contents: read
+
 jobs:
  setup-instance:
    name: Setup instance (cuda-pcc)
@@ -45,7 +50,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-pcc:
    name: CUDA post-commit checks
@@ -77,8 +82,10 @@ jobs:
        shell: bash
        run: |
          TOOLKIT_VERSION="$(echo ${{ matrix.cuda }} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
-          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/"${CUDA_KEYRING_PACKAGE}"
+          echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
+          sha256sum -c checksum
+          sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
          sudo apt update
          sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format

@@ -113,15 +120,21 @@ jobs:
        run: |
          make pcc_gpu

+      - name: Check build with hpu enabled
+        run: |
+          make clippy_gpu_hpu
+
      - name: Set pull-request URL
        if: ${{ failure() && github.event_name == 'pull_request' }}
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Slack Notification
        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -146,7 +159,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-pcc) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -25,6 +25,9 @@ on:
  pull_request:
    types: [ labeled ]

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -42,7 +45,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            gpu:
@@ -89,7 +92,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
    name: CUDA signed integer tests with classical PBS
@@ -141,11 +144,13 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Integer GPU signed integer tests with classical PBS finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -170,7 +175,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-signed-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -25,6 +25,8 @@ on:
  pull_request:
    types: [ labeled ]

+permissions:
+  contents: read

 jobs:
  should-run:
@@ -43,7 +45,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            gpu:
@@ -103,7 +105,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
    name: CUDA H100 signed integer tests
@@ -156,11 +158,13 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -185,7 +189,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -29,6 +29,9 @@ on:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -46,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            gpu:
@@ -93,7 +96,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-signed-integer-tests:
    name: CUDA signed integer tests
@@ -153,11 +156,13 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}
          SLACK_MESSAGE: "Signed GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -182,7 +187,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -25,6 +25,8 @@ on:
  pull_request:
    types: [ labeled ]

+permissions:
+  contents: read

 jobs:
  should-run:
@@ -43,7 +45,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            gpu:
@@ -90,7 +92,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
    name: CUDA unsigned integer tests with classical PBS
@@ -142,11 +144,13 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Unsigned integer GPU classic tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -171,7 +175,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -25,6 +25,9 @@ on:
  pull_request:
    types: [ labeled ]

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -42,7 +45,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            gpu:
@@ -102,7 +105,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
    name: CUDA H100 unsigned integer tests
@@ -155,11 +158,13 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -184,7 +189,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -29,6 +29,9 @@ on:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"

+permissions:
+  contents: read
+
 jobs:
  should-run:
    runs-on: ubuntu-latest
@@ -46,7 +49,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@26a38635fc1173cc5820336ce97be6188d0de9f5 # v46.0.2
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        with:
          files_yaml: |
            gpu:
@@ -93,7 +96,7 @@ jobs:
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
+          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-unsigned-integer-tests:
    name: CUDA unsigned integer tests
@@ -153,11 +156,13 @@ jobs:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
-          echo "PULL_REQUEST_MD_LINK=[pull-request](${{ vars.PR_BASE_URL }}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), "  >> "${GITHUB_ENV}"
+        env:
+          PR_BASE_URL: ${{ vars.PR_BASE_URL }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}
          SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
@@ -182,7 +187,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/hpu_hlapi_tests.yml
+++ b/.github/workflows/hpu_hlapi_tests.yml
@@ -0,0 +1,73 @@
+# Run the tfhe-rs high-level API (HLAPI) tests against the HPU backend mockup
+name: Cargo Test HLAPI HPU
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+
+env:
+  CARGO_TERM_COLOR: always
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}  # head_ref is empty on push; fall back to a unique id so runs on main are not cancelled
+  cancel-in-progress: true
+
+
+permissions: {}  # no default token permissions; jobs declare what they need
+
+jobs:
+  should-run:
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: read
+    outputs:
+      hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}
+
+      - name: Check for file changes
+        id: changed-files
+        uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
+        with:
+          files_yaml: |
+            hpu:
+              - tfhe/Cargo.toml
+              - Makefile
+              - backends/tfhe-hpu-backend/**
+              - mockups/tfhe-hpu-mockup/**
+
+  cargo-tests-hpu:
+    needs: should-run
+    if: needs.should-run.outputs.hpu_test == 'true'  # gate on the should-run job's hpu_test output
+    runs-on: large_ubuntu_16
+    steps:
+      - name: Checkout tfhe-rs
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ env.CHECKOUT_TOKEN }}

+      - name: Install Rust
+        uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
+        with:
+          toolchain: stable
+          override: true

+      - name: Install Just
+        run: |
+          cargo install just

+      - name: Test HLAPI HPU
+        run: |
+          source setup_hpu.sh
+          just -f mockups/tfhe-hpu-mockup/Justfile  BUILD_PROFILE=release mockup &
+          make HPU_CONFIG=sim test_high_level_api_hpu
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -18,6 +18,9 @@ on:
    # Weekly tests will be triggered each Friday at 9p.m.
    - cron: "0 21 * * 5"

+
+permissions: {}
+
 jobs:
  setup-instance:
    name: Setup instance (cpu-tests)
@@ -63,9 +66,9 @@ jobs:
          make test_integer_long_run

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "CPU long run tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -89,7 +92,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cpu-long-run-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -27,6 +27,9 @@ concurrency:
  group: ${{ github.workflow_ref }}
  cancel-in-progress: true

+permissions:
+  contents: read
+
 jobs:
  cargo-builds-m1:
    if: ${{ (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs') ||
@@ -190,7 +193,7 @@ jobs:
      - name: Slack Notification
        if: ${{ needs.cargo-builds-m1.result != 'skipped' }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ needs.cargo-builds-m1.result }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -28,6 +28,12 @@ on:
 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  NPM_TAG: ""
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}

 jobs:
  verify_tag:
@@ -94,7 +100,7 @@ jobs:
        run: |
          echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
      - name: Download artifact
-        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          name: crate
          path: target/package
@@ -104,7 +110,10 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          cargo publish -p tfhe --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+          # DRY_RUN is deliberately left unquoted: when it is empty, a quoted expansion would hand cargo publish an
+          # empty argument and make it fail. This is safe since DRY_RUN is set in the env section above.
+          # shellcheck disable=SC2086
+          cargo publish -p tfhe --token "${CRATES_TOKEN}" ${DRY_RUN}

      - name: Generate hash
        id: published_hash
@@ -113,14 +122,10 @@ jobs:
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "SLSA tfhe crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

      - name: Build web package
        if: ${{ inputs.push_web_package }}
@@ -156,13 +161,9 @@ jobs:
          provenance: true

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "tfhe release failed: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -15,6 +15,8 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+permissions: {}
+
 jobs:
  verify_tag:
    uses: ./.github/workflows/verify_tagged_commit.yml
@@ -157,7 +159,10 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          cargo publish -p tfhe-cuda-backend --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+          # DRY_RUN is deliberately left unquoted: when it is empty, a quoted expansion would hand cargo publish an
+          # empty argument and make it fail. This is safe since DRY_RUN is set in the env section above.
+          # shellcheck disable=SC2086
+          cargo publish -p tfhe-cuda-backend --token "${CRATES_TOKEN}" ${DRY_RUN}

      - name: Generate hash
        id: published_hash
@@ -166,19 +171,15 @@ jobs:
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "SLSA tfhe-cuda-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-cuda-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -202,7 +203,7 @@ jobs:
      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (publish-cuda-release) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_hpu.yml
+++ b/.github/workflows/make_release_hpu.yml
@@ -0,0 +1,114 @@
+# Publish the tfhe-hpu-backend crate on crates.io.
+# The crate is packaged and hashed first; unless this is a dry-run, SLSA provenance is generated for
+# that hash. The crate is then published and the hash of the published crate is compared with it.
+name: Publish HPU release
+
+on:
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry-run"
+        type: boolean
+        default: true
+
+env:
+  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+# Start from no token permissions at all; only the provenance job requests some.
+permissions: {}
+
+jobs:
+  # Gate: reusable workflow that must succeed before anything is packaged.
+  verify_tag:
+    uses: ./.github/workflows/verify_tagged_commit.yml
+    secrets:
+      RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
+      READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
+
+  # Packages the crate, uploads it as the `crate` artifact and exposes its hash as a job output.
+  package:
+    runs-on: ubuntu-latest
+    needs: verify_tag
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+      - name: Prepare package
+        run: |
+          cargo package -p tfhe-hpu-backend
+      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: crate
+          path: target/package/*.crate
+      - name: generate hash
+        id: hash
+        run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+  # Generates the SLSA provenance for the packaged crate; skipped when dry_run is set.
+  provenance:
+    if: ${{ !inputs.dry_run }}
+    needs: [package]
+    uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
+    permissions:
+      # Needed to detect the GitHub Actions environment
+      actions: read
+      # Needed to create the provenance via GitHub OIDC
+      id-token: write
+      # Needed to upload assets/artifacts
+      contents: write
+    with:
+      # SHA-256 hashes of the Crate package.
+      base64-subjects: ${{ needs.package.outputs.hash }}
+
+  # Runs `cargo publish` (with --dry-run when dry_run is set), then reports problems on Slack.
+  publish_release:
+    name: Publish tfhe-hpu-backend Release
+    runs-on: ubuntu-latest
+    needs: [verify_tag, package] # for comparing hashes
+    steps:
+      - name: Checkout
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+
+      - name: Publish crate.io package
+        env:
+          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
+        run: |
+          # DRY_RUN is deliberately left unquoted: when it is empty, a quoted expansion would hand cargo publish an
+          # empty argument and make it fail. This is safe since DRY_RUN is set in the env section above.
+          # shellcheck disable=SC2086
+          cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
+
+      - name: Generate hash
+        id: published_hash
+        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
+
+      - name: Slack notification (hashes comparison)
+        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
+        env:
+          SLACK_COLOR: failure
+          SLACK_MESSAGE: "SLSA tfhe-hpu-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
+
+      # Also runs on cancellation, hence the message reports the actual job status.
+      - name: Slack Notification
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
+        continue-on-error: true
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
+        env:
+          SLACK_COLOR: ${{ job.status }}
+          SLACK_MESSAGE: "tfhe-hpu-backend release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_tfhe_csprng.yml
+++ b/.github/workflows/make_release_tfhe_csprng.yml
@@ -10,6 +10,12 @@ on:

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}

 jobs:
  verify_tag:
@@ -27,6 +33,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Prepare package
        run: |
          cargo package -p tfhe-csprng
@@ -64,9 +72,10 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Download artifact
-        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          name: crate-tfhe-csprng
          path: target/package
@@ -75,29 +84,24 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          cargo publish -p tfhe-csprng --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+          # DRY_RUN is deliberately left unquoted: when it is empty, a quoted expansion would hand cargo publish an
+          # empty argument and make it fail. This is safe since DRY_RUN is set in the env section above.
+          # shellcheck disable=SC2086
+          cargo publish -p tfhe-csprng --token "${CRATES_TOKEN}" ${DRY_RUN}
      - name: Generate hash
        id: published_hash
        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "SLSA tfhe-csprng - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "tfhe-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_tfhe_fft.yml
+++ b/.github/workflows/make_release_tfhe_fft.yml
@@ -11,6 +11,12 @@ on:

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}

 jobs:
  verify_tag:
@@ -29,7 +35,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Prepare package
        run: |
          cargo package -p tfhe-fft
@@ -65,14 +72,18 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Publish crate.io package
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          cargo publish -p tfhe-fft --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+          # DRY_RUN is deliberately left unquoted: when it is empty, a quoted expansion would hand cargo publish an
+          # empty argument and make it fail. This is safe since DRY_RUN is set in the env section above.
+          # shellcheck disable=SC2086
+          cargo publish -p tfhe-fft --token "${CRATES_TOKEN}" ${DRY_RUN}

      - name: Generate hash
        id: published_hash
@@ -81,23 +92,15 @@ jobs:
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "SLSA tfhe-fft crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "tfhe-fft release failed: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_tfhe_ntt.yml
+++ b/.github/workflows/make_release_tfhe_ntt.yml
@@ -11,6 +11,12 @@ on:

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}

 jobs:
  verify_tag:
@@ -29,7 +35,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Prepare package
        run: |
          cargo package -p tfhe-ntt
@@ -65,13 +72,18 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Publish crate.io package
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          cargo publish -p tfhe-ntt --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+          # DRY_RUN is deliberately left unquoted: when it is empty, a quoted expansion would hand cargo publish an
+          # empty argument and make it fail. This is safe since DRY_RUN is set in the env section above.
+          # shellcheck disable=SC2086
+          cargo publish -p tfhe-ntt --token "${CRATES_TOKEN}" ${DRY_RUN}

      - name: Generate hash
        id: published_hash
@@ -80,23 +92,15 @@ jobs:
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "SLSA tfhe-ntt crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "tfhe-ntt release failed: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/make_release_tfhe_versionable.yml
+++ b/.github/workflows/make_release_tfhe_versionable.yml
@@ -10,6 +10,8 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

+permissions: {}
+
 jobs:
  verify_tag:
    uses: ./.github/workflows/verify_tagged_commit.yml
@@ -27,6 +29,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Prepare package
        run: |
          cargo package -p tfhe-versionable-derive
@@ -64,7 +68,7 @@ jobs:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Download artifact
-        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          name: crate-tfhe-versionable-derive
          path: target/package
@@ -72,21 +76,21 @@ jobs:
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
        run: |
-          cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }}
+          cargo publish -p tfhe-versionable-derive --token "${CRATES_TOKEN}"
      - name: Generate hash
        id: published_hash
        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package-derive.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
          SLACK_MESSAGE: "SLSA tfhe-versionable-derive - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-versionable-derive release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
@@ -102,6 +106,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Prepare package
        run: |
          cargo package -p tfhe-versionable
@@ -136,8 +142,10 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Download artifact
-        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          name: crate-tfhe-versionable
          path: target/package
@@ -145,21 +153,21 @@ jobs:
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
        run: |
-          cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}
+          cargo publish -p tfhe-versionable --token "${CRATES_TOKEN}"
      - name: Generate hash
        id: published_hash
        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
          SLACK_MESSAGE: "SLSA tfhe-versionable - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -10,6 +10,12 @@ on:

 env:
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
+  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
+  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
+  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+
+permissions: {}

 jobs:
  package:
@@ -21,6 +27,8 @@ jobs:
          uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
          with:
            fetch-depth: 0
+            persist-credentials: 'false'
+            token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
        - name: Prepare package
          run: |
            cargo package -p tfhe-zk-pok
@@ -64,7 +72,7 @@ jobs:
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Download artifact
-        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
+        uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
        with:
          name: crate-zk-pok
          path: target/package
@@ -73,29 +81,24 @@ jobs:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
          DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
        run: |
-          cargo publish -p tfhe-zk-pok --token ${{ env.CRATES_TOKEN }} ${{ env.DRY_RUN }}
+          # DRY_RUN is deliberately left unquoted: when it is empty, a quoted expansion would hand cargo publish an
+          # empty argument and make it fail. This is safe since DRY_RUN is set in the env section above.
+          # shellcheck disable=SC2086
+          cargo publish -p tfhe-zk-pok --token "${CRATES_TOKEN}" ${DRY_RUN}
      - name: Verify hash
        id: published_hash
        run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
      - name: Slack notification (hashes comparison)
        if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: failure
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "SLSA tfhe-zk-pok crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
      - name: Slack Notification
-        if: ${{ failure() }}
+        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990 # v2.3.2
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
-          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
          SLACK_MESSAGE: "tfhe-zk-pok release failed: (${{ env.ACTION_RUN_URL }})"
-          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
-          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
--- a/.github/workflows/parameters_check.yml
+++ b/.github/workflows/parameters_check.yml
@@ -12,12 +12,17 @@ on:
      - "main"
  workflow_dispatch:

+permissions: {}
+
 jobs:
  params-curves-security-check:
    runs-on: large_ubuntu_16-22.04
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          persist-credentials: 'false'
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Checkout lattice-estimator
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -25,6 +30,7 @@ jobs:
          repository: malb/lattice-estimator
          path: lattice_estimator
          ref: 'e80ec6bbbba212428b0e92d0467c18629cf9ed67'
+          persist-credentials: 'false'

      - name: Install Sage
        run: |
@@ -42,7 +48,7 @@ jobs:
      - name: Slack Notification
        if: ${{ always() }}
        continue-on-error: true
-        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
+        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
--- a/.github/workflows/placeholder_workflow.yml
+++ b/.github/workflows/placeholder_workflow.yml
@@ -4,6 +4,8 @@ name: Placeholder Workflow
 on:
  workflow_dispatch:

+permissions: {}
+
 jobs:
  placeholder:
    name: Placeholder
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -7,6 +7,8 @@ on:
      - 'main'
  workflow_dispatch:

+permissions: {}
+
 jobs:
  sync-repo:
    if: ${{ github.repository == 'zama-ai/tfhe-rs' }}
--- a/.github/workflows/verify_tagged_commit.yml
+++ b/.github/workflows/verify_tagged_commit.yml
@@ -9,6 +9,8 @@ on:
      READ_ORG_TOKEN:
        required: true

+permissions: {}
+
 jobs:
  checks:
    runs-on: ubuntu-latest
@@ -26,7 +28,10 @@ jobs:

      - name: Actor authorized
        run: |
-          if [ "${{ steps.actor_check.outputs.authorized }}" == "false" ]; then
-            echo "Actor '${{ github.triggering_actor }}' is not authorized to perform release"
+          if [ "${ACTOR_CHECK_OUTPUT}" == "false" ]; then
+            echo "Actor '${TRIGGERING_ACTOR}' is not authorized to perform release"
            exit 1
          fi
+        env:
+          TRIGGERING_ACTOR: ${{ github.triggering_actor }}
+          ACTOR_CHECK_OUTPUT: ${{ steps.actor_check.outputs.authorized }}
--- a/.gitignore
+++ b/.gitignore
@@ -34,7 +34,9 @@ package-lock.json

 # Python .env
 .env
+__pycache__

 # Dir used for backward compatibility test data
-tests/tfhe-backward-compat-data/
+# No trailing slash so the pattern also matches a symlink with that name, not only a directory
+tests/tfhe-backward-compat-data
 ci/
--- a/.lfsconfig
+++ b/.lfsconfig
@@ -0,0 +1,2 @@
+[lfs]
+  fetchexclude = *
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -0,0 +1,18 @@
+# Specifying a path without code owners means that path won't have owners and is akin to a negation
+# i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
+# See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file
+/tfhe/src/core_crypto/                  @IceTDrinker
+/tfhe/src/core_crypto/gpu
+
+/tfhe/src/shortint/                     @mayeul-zama
+
+/tfhe/src/integer/                      @tmontaigu
+/tfhe/src/integer/gpu
+
+/tfhe/src/high_level_api/               @tmontaigu
+
+/Makefile                               @IceTDrinker @soonum
+
+/.github/                               @soonum
+
+/CODEOWNERS                             @IceTDrinker
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -102,8 +102,7 @@ For example, if you made changes in `tfhe/src/integer/*`, you can test them with
 ## 4. Committing

 **TFHE-rs** follows the conventional commit specification to maintain a consistent commit history, essential for Semantic Versioning ([semver.org](https://semver.org/)).
-Commit messages are automatically checked in CI and will be rejected if they do not comply, so make sure that you follow the commit conventions detailed on [this page]
-(https://www.conventionalcommits.org/en/v1.0.0/).
+Commit messages are automatically checked in CI and will be rejected if they do not comply, so make sure that you follow the commit conventions detailed on [this page](https://www.conventionalcommits.org/en/v1.0.0/).

 ## 5. Rebasing

@@ -145,12 +144,15 @@ sequenceDiagram
    Reviewer -->> GitHub: Merge if pipeline green
 ```

-> [!Note]
->Useful details:
->* pipeline is triggered by humans
->* review team is located in Paris timezone, pipeline launch will most likely happen during office hours
->* direct changes to CI related files are not allowed for external contributors
->* run `make pcc` to fix any build errors before pushing commits
+{% hint style="info" %}
+
+## Useful details:
+
+- pipeline is triggered by humans
+- review team is located in Paris timezone, pipeline launch will most likely happen during office hours
+- direct changes to CI related files are not allowed for external contributors
+- run `make pcc` to fix any build errors before pushing commits
+{% endhint %}

 ## 8. Data versioning

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,15 +2,19 @@
 resolver = "2"
 members = [
    "tfhe",
+    "tfhe-benchmark",
    "tfhe-fft",
    "tfhe-ntt",
    "tfhe-zk-pok",
    "tasks",
    "tfhe-csprng",
    "backends/tfhe-cuda-backend",
+    "backends/tfhe-hpu-backend",
    "utils/tfhe-versionable",
    "utils/tfhe-versionable-derive",
+    "utils/param_dedup",
    "tests",
+    "mockups/tfhe-hpu-mockup",
 ]

 exclude = [
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@ SHELL:=$(shell /usr/bin/env which bash)
 OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
+CARGO_BUILD_JOBS=default
 CPU_COUNT=$(shell ./scripts/cpu_count.sh)
 RS_BUILD_TOOLCHAIN:=stable
 CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
@@ -55,6 +56,9 @@ REGEX_PATTERN?=''
 TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
 TFHECUDA_BUILD=$(TFHECUDA_SRC)/build

+# tfhe-hpu-backend
+HPU_CONFIG=v80
+
 # Exclude these files from coverage reports
 define COVERAGE_EXCLUDED_FILES
 --exclude-files apps/trivium/src/trivium/* \
@@ -163,6 +167,12 @@ install_typos_checker: install_rs_build_toolchain
 	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
 	( echo "Unable to install typos-cli, unknown error." && exit 1 )

+.PHONY: install_zizmor # Install zizmor workflow security checker
+install_zizmor: install_rs_build_toolchain
+	@zizmor --version > /dev/null 2>&1 || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor || \
+	( echo "Unable to install zizmor, unknown error." && exit 1 )
+
 .PHONY: setup_venv # Setup Python virtualenv for wasm tests
 setup_venv:
 	python3 -m venv venv
@@ -284,7 +294,7 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types \
 		--all-targets \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

@@ -295,6 +305,20 @@ check_gpu: install_rs_check_toolchain
 		--all-targets \
 		-p $(TFHE_SPEC)

+.PHONY: clippy_hpu # Run clippy lints on tfhe with "hpu" enabled
+clippy_hpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean,shortint,integer,internal-keycache,hpu,pbs-stats,extended-types \
+		--all-targets \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings
+
+.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
+clippy_gpu_hpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types \
+		--all-targets \
+		-p $(TFHE_SPEC) -- --no-deps -D warnings
+
 .PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
 fix_newline: check_linelint_installed
 	linelint -a .
@@ -307,6 +331,10 @@ check_newline: check_linelint_installed
 lint_workflow: check_actionlint_installed
 	actionlint

+.PHONY: check_workflow_security # Run zizmor security checker on GitHub workflows
+check_workflow_security: install_zizmor
+	zizmor --persona pedantic .
+
 .PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
 clippy_core: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -366,7 +394,7 @@ clippy_rustdoc: install_rs_check_toolchain
 		echo "WARNING: skipped clippy_rustdoc, unsupported OS $(OS)"; \
 		exit 0; \
 	fi && \
-	CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
+	CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
 		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental \
 		-p $(TFHE_SPEC)
@@ -377,7 +405,7 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
 		echo "WARNING: skipped clippy_rustdoc_gpu, unsupported OS $(OS)"; \
 		exit 0; \
 	fi && \
-	CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
+	CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
 		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
 		-p $(TFHE_SPEC)
@@ -407,6 +435,11 @@ clippy_trivium: install_rs_check_toolchain
 	cd apps/trivium; RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-trivium -- --no-deps -D warnings

+.PHONY: clippy_ws_tests # Run clippy on the workspace level tests
+clippy_ws_tests: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --tests \
+		-p tests --features=shortint,integer,zk-pok -- --no-deps -D warnings
+
 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
@@ -439,10 +472,15 @@ clippy_tfhe_lints: install_cargo_dylint # the toolchain is selected with toolcha
 	rustup toolchain install && \
 	cargo clippy --all-targets -- --no-deps -D warnings

+.PHONY: clippy_param_dedup # Run clippy lints on param_dedup tool
+clippy_param_dedup: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p param_dedup -- --no-deps -D warnings
+
 .PHONY: clippy_all # Run all clippy targets
 clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
 clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_tfhe_csprng clippy_zk_pok clippy_trivium \
-clippy_versionable clippy_tfhe_lints
+clippy_versionable clippy_tfhe_lints clippy_ws_tests clippy_bench clippy_param_dedup

 .PHONY: clippy_fast # Run main clippy targets
 clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -453,6 +491,11 @@ clippy_cuda_backend: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
 		-p tfhe-cuda-backend -- --no-deps -D warnings

+.PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend
+clippy_hpu_backend: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		-p tfhe-hpu-backend -- --no-deps -D warnings
+
 .PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
 check_rust_bindings_did_not_change:
 	cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
@@ -682,6 +725,28 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"

+.PHONY: test_integer_hpu_ci # Run the tests for integer ci on hpu backend
+test_integer_hpu_ci: install_rs_check_toolchain install_cargo_nextest
+	cargo test --release -p $(TFHE_SPEC) --features hpu-v80 --test hpu
+
+.PHONY: test_integer_hpu_mockup_ci # Run the tests for integer ci on hpu backend and mockup
+test_integer_hpu_mockup_ci: install_rs_check_toolchain install_cargo_nextest
+	source ./setup_hpu.sh --config sim ; \
+	cargo build --release --bin hpu_mockup; \
+	coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \
+	HPU_TEST_ITER=1 \
+	cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32; \
+	test_status=$$?; kill %1; exit $$test_status
+
+.PHONY: test_integer_hpu_mockup_ci_fast # Run the quick tests for integer ci on hpu backend and mockup.
+test_integer_hpu_mockup_ci_fast: install_rs_check_toolchain install_cargo_nextest
+	source ./setup_hpu.sh --config sim ; \
+	cargo build --profile devo --bin hpu_mockup; \
+	coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \
+	HPU_TEST_ITER=1 \
+	cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32; \
+	test_status=$$?; kill %1; exit $$test_status
+
 .PHONY: test_boolean # Run the tests of the boolean module
 test_boolean: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -837,6 +902,22 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 		--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \
 		-E "test(/high_level_api::.*gpu.*/)"

+.PHONY: test_high_level_api_hpu # Run the high_level_api tests for the HPU backend
+test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
+ifeq ($(HPU_CONFIG), v80)
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
+		--build-jobs=$(CARGO_BUILD_JOBS) \
+		--test-threads=1 \
+		--features=integer,internal-keycache,hpu,hpu-v80 -p $(TFHE_SPEC) \
+		-E "test(/high_level_api::.*hpu.*/)"
+else
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
+		--build-jobs=$(CARGO_BUILD_JOBS) \
+		--test-threads=1 \
+		--features=integer,internal-keycache,hpu -p $(TFHE_SPEC) \
+		-E "test(/high_level_api::.*hpu.*/)"
+endif
+
 .PHONY: test_strings # Run the tests for strings ci
 test_strings: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -954,8 +1035,8 @@ lint_doc: install_rs_check_toolchain
 lint_docs: lint_doc

 .PHONY: format_doc_latex # Format the documentation latex equations to avoid broken rendering.
-format_doc_latex:
-	RUSTFLAGS="" cargo xtask format_latex_doc
+format_doc_latex: install_rs_build_toolchain
+	RUSTFLAGS="" cargo "$(CARGO_RS_BUILD_TOOLCHAIN)" xtask format_latex_doc
 	@"$(MAKE)" --no-print-directory fmt
 	@printf "\n===============================\n\n"
 	@printf "Please manually inspect changes made by format_latex_doc, rustfmt can break equations \
@@ -963,8 +1044,8 @@ format_doc_latex:
 	@printf "\n===============================\n"

 .PHONY: check_md_docs_are_tested # Checks that the rust codeblocks in our .md files are tested
-check_md_docs_are_tested:
-	RUSTFLAGS="" cargo xtask check_tfhe_docs_are_tested
+check_md_docs_are_tested: install_rs_build_toolchain
+	RUSTFLAGS="" cargo "$(CARGO_RS_BUILD_TOOLCHAIN)" xtask check_tfhe_docs_are_tested

 .PHONY: check_intra_md_links # Checks broken internal links in Markdown docs
 check_intra_md_links: install_mlc
@@ -1068,6 +1149,24 @@ dieharder_csprng: install_dieharder build_tfhe_csprng
 # Benchmarks
 #

+.PHONY: clippy_bench # Run clippy lints on tfhe-benchmark
+clippy_bench: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		--features=boolean,shortint,integer,internal-keycache,nightly-avx512,pbs-stats,zk-pok \
+		-p tfhe-benchmark -- --no-deps -D warnings
+
+.PHONY: clippy_bench_gpu # Run clippy lints on tfhe-benchmark with "gpu" enabled
+clippy_bench_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		--features=gpu,shortint,integer,internal-keycache,nightly-avx512,pbs-stats,zk-pok \
+		-p tfhe-benchmark -- --no-deps -D warnings
+
+.PHONY: clippy_bench_hpu # Run clippy lints on tfhe-benchmark with "hpu" enabled
+clippy_bench_hpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
+		--features=hpu,shortint,integer,internal-keycache,pbs-stats \
+		-p tfhe-benchmark -- --no-deps -D warnings
+
 .PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
 print_doc_bench_parameters:
 	RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
@@ -1078,42 +1177,57 @@ bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_signed_integer_gpu # Run benchmarks for signed integer on GPU backend
 bench_signed_integer_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --
+
+.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
+bench_integer_hpu: install_rs_check_toolchain
+	source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench integer-bench \
+	--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
+
+.PHONY: bench_integer_zk_gpu # Run benchmarks for integer encryption with ZK proofs on GPU backend
+bench_integer_zk_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench zk-pke-bench \
+	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
@@ -1121,7 +1235,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -1129,7 +1243,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1137,7 +1251,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_signed_integer_multi_bit_gpu # Run benchmarks for signed integer on GPU backend using multi-bit parameters
 bench_signed_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1145,7 +1259,7 @@ bench_signed_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
@@ -1153,86 +1267,83 @@ bench_integer_zk: install_rs_check_toolchain
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench zk-pke-bench \
 	--features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \
-	-p $(TFHE_SPEC) --
+	-p tfhe-benchmark --

 .PHONY: bench_shortint # Run benchmarks for shortint
 bench_shortint: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=shortint,internal-keycache,nightly-avx512 -p tfhe-benchmark

 .PHONY: bench_shortint_oprf # Run benchmarks for shortint
 bench_shortint_oprf: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench oprf-shortint-bench \
-	--features=shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
-bench_shortint_multi_bit: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT \
-	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench shortint-bench \
-	--features=shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=shortint,internal-keycache,nightly-avx512 -p tfhe-benchmark

 .PHONY: bench_boolean # Run benchmarks for boolean
 bench_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench boolean-bench \
-	--features=boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_pbs # Run benchmarks for PBS
-bench_pbs: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench pbs-bench \
-	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_ks_pbs # Run benchmarks for KS-PBS
-bench_ks_pbs: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench ks-pbs-bench \
-	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_ks_pbs_gpu # Run benchmarks for KS-PBS on GPU backend
-bench_ks_pbs_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench ks-pbs-bench \
-	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_pbs128 # Run benchmarks for PBS using FFT 128 bits
-bench_pbs128: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench pbs128-bench \
-	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_pbs128_gpu # Run benchmarks for PBS using FFT 128 bits on GPU
-bench_pbs128_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench pbs128-bench \
-	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
-
-.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
-bench_pbs_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) \
-	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
-	--bench pbs-bench \
-	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=boolean,internal-keycache,nightly-avx512 -p tfhe-benchmark

 .PHONY: bench_ks # Run benchmarks for keyswitch
 bench_ks: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench ks-bench \
-	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=boolean,shortint,internal-keycache,nightly-avx512 -p tfhe-benchmark

-.PHONY: bench_ks_gpu # Run benchmarks for PBS on GPU backend
+.PHONY: bench_ks_gpu # Run benchmarks for keyswitch on GPU backend
 bench_ks_gpu: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench ks-bench \
-	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark
+
+.PHONY: bench_pbs # Run benchmarks for PBS
+bench_pbs: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench pbs-bench \
+	--features=boolean,shortint,internal-keycache,nightly-avx512 -p tfhe-benchmark
+
+.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
+bench_pbs_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench pbs-bench \
+	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark
+
+.PHONY: bench_ks_pbs # Run benchmarks for KS-PBS
+bench_ks_pbs: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench ks-pbs-bench \
+	--features=boolean,shortint,internal-keycache,nightly-avx512 -p tfhe-benchmark
+
+.PHONY: bench_ks_pbs_gpu # Run benchmarks for KS-PBS on GPU backend
+bench_ks_pbs_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_PARAMS_SET=$(BENCH_PARAMS_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench ks-pbs-bench \
+	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark
+
+.PHONY: bench_pbs128 # Run benchmarks for PBS using FFT 128 bits
+bench_pbs128: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench pbs128-bench \
+	--features=boolean,shortint,internal-keycache,nightly-avx512 -p tfhe-benchmark
+
+.PHONY: bench_pbs128_gpu # Run benchmarks for PBS using FFT 128 bits on GPU
+bench_pbs128_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench pbs128-bench \
+	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark

 bench_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
 bench_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
@@ -1264,17 +1375,37 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
 	nvm use $(NODE_VERSION) && \
 	$(MAKE) bench_web_js_api_parallel_firefox

-.PHONY: bench_hlapi_erc20 # Run benchmarks for ECR20 operations
+.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
 bench_hlapi_erc20: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
-	--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --

-.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ECR20 operations on GPU
+.PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
 bench_hlapi_erc20_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
-	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
+
+.PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
+bench_hlapi_dex: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
+
+.PHONY: bench_hlapi_dex_gpu # Run benchmarks for DEX operations on GPU
+bench_hlapi_dex_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
+
+.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
+bench_hlapi_erc20_hpu: install_rs_check_toolchain
+	source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
+	RUSTFLAGS="$(RUSTFLAGS)" \
+	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-erc20 \
+	--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick

 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
 bench_tfhe_zk_pok: install_rs_check_toolchain
@@ -1301,20 +1432,23 @@ gen_key_cache_core_crypto: install_rs_build_toolchain
 .PHONY: measure_hlapi_compact_pk_ct_sizes # Measure sizes of public keys and ciphertext for high-level API
 measure_hlapi_compact_pk_ct_sizes: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example hlapi_compact_pk_ct_sizes \
-	--features=integer,internal-keycache
+	--bin hlapi_compact_pk_ct_sizes \
+	--features=integer,internal-keycache \
+	-p tfhe-benchmark

 .PHONY: measure_shortint_key_sizes # Measure sizes of bootstrapping and key switching keys for shortint
 measure_shortint_key_sizes: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example shortint_key_sizes \
-	--features=shortint,internal-keycache
+	--bin shortint_key_sizes \
+	--features=shortint,internal-keycache \
+	-p tfhe-benchmark

 .PHONY: measure_boolean_key_sizes # Measure sizes of bootstrapping and key switching keys for boolean
 measure_boolean_key_sizes: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example boolean_key_sizes \
-	--features=boolean,internal-keycache
+	--bin boolean_key_sizes \
+	--features=boolean,internal-keycache \
+	-p tfhe-benchmark

 .PHONY: parse_integer_benches # Run python parser to output a csv containing integer benches data
 parse_integer_benches:
@@ -1325,8 +1459,9 @@ parse_integer_benches:
 .PHONY: parse_wasm_benchmarks # Parse benchmarks performed with WASM web client into a CSV file
 parse_wasm_benchmarks: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example wasm_benchmarks_parser \
+	--bin wasm_benchmarks_parser \
 	--features=shortint,internal-keycache \
+	-p tfhe-benchmark \
 	-- wasm_benchmark_results.json

 .PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
@@ -1369,7 +1504,10 @@ tfhe_lints

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
-clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
+clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
+
+.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
+pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast

 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
 fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
--- a/README.md
+++ b/README.md
@@ -206,6 +206,7 @@ If you want to work within the IND-CPA security model, which is less strict than
 The default parameters used in the High-Level API with the GPU backend are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at $p_{error} \le 2^{-64}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [2].

 [1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf
+
 [2] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf

 #### Side-channel attacks
--- a/_typos.toml
+++ b/_typos.toml
@@ -11,11 +11,13 @@ extend-ignore-identifiers-re = [
    # Example with string replacing "hello" with "herlo"
    "herlo",
    # Example in trivium
-    "C9217BA0D762ACA1"
+    "C9217BA0D762ACA1",
+    "0x[0-9a-fA-F]+"
 ]

 [files]
 extend-exclude = [
    "backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
    "backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
+    "backends/tfhe-hpu-backend/config_store/**/*.link_summary",
 ]
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.

 # FHE shortint Trivium implementation

-The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
+The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
 It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
 on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes
 its setup a little more intricate.
@@ -137,10 +137,10 @@ its setup a little more intricate.
 Example code:
 ```rust
 use tfhe::shortint::prelude::*;
-use tfhe::shortint::parameters::v1_1::{
-    V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{ConfigBuilder, generate_keys, FheUint64};
 use tfhe::prelude::*;
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

-    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_1::{
-    V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {

 pub fn kreyvium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {

 pub fn kreyvium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_1::{
-    V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};

 pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {

 pub fn trivium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {

 pub fn trivium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,9 +1,9 @@
 use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_1::{
-    V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo renaud1239/Kreyvium,
@@ -66,7 +66,7 @@ fn get_hexagonal_string_from_bytes(a: Vec<u8>) -> String {
    assert!(a.len() % 8 == 0);
    let mut hexadecimal: String = "".to_string();
    for test in a {
-        hexadecimal.push_str(&format!("{:02X?}", test));
+        hexadecimal.push_str(&format!("{test:02X?}"));
    }
    hexadecimal
 }
@@ -74,7 +74,7 @@ fn get_hexagonal_string_from_bytes(a: Vec<u8>) -> String {
 fn get_hexagonal_string_from_u64(a: Vec<u64>) -> String {
    let mut hexadecimal: String = "".to_string();
    for test in a {
-        hexadecimal.push_str(&format!("{:016X?}", test));
+        hexadecimal.push_str(&format!("{test:016X?}"));
    }
    hexadecimal
 }
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn kreyvium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/src/static_deque/static_deque.rs
+++ b/apps/trivium/src/static_deque/static_deque.rs
@@ -55,7 +55,7 @@ impl<const N: usize, T> Index<usize> for StaticDeque<N, T> {
    /// 0 is youngest
    fn index(&self, i: usize) -> &T {
        if i >= N {
-            panic!("Index {:?} too high for size {:?}", i, N);
+            panic!("Index {i:?} too high for size {N:?}");
        }
        &self.arr[(N + self.cursor - i - 1) % N]
    }
@@ -66,7 +66,7 @@ impl<const N: usize, T> IndexMut<usize> for StaticDeque<N, T> {
    /// 0 is youngest
    fn index_mut(&mut self, i: usize) -> &mut T {
        if i >= N {
-            panic!("Index {:?} too high for size {:?}", i, N);
+            panic!("Index {i:?} too high for size {N:?}");
        }
        &mut self.arr[(N + self.cursor - i - 1) % N]
    }
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,9 +1,9 @@
 use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_1::{
-    V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::v1_2::{
+    V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
@@ -66,7 +66,7 @@ fn get_hexagonal_string_from_bytes(a: Vec<u8>) -> String {
    assert!(a.len() % 8 == 0);
    let mut hexadecimal: String = "".to_string();
    for test in a {
-        hexadecimal.push_str(&format!("{:02X?}", test));
+        hexadecimal.push_str(&format!("{test:02X?}"));
    }
    hexadecimal
 }
@@ -74,7 +74,7 @@ fn get_hexagonal_string_from_bytes(a: Vec<u8>) -> String {
 fn get_hexagonal_string_from_u64(a: Vec<u64>) -> String {
    let mut hexadecimal: String = "".to_string();
    for test in a {
-        hexadecimal.push_str(&format!("{:016X?}", test));
+        hexadecimal.push_str(&format!("{test:016X?}"));
    }
    hexadecimal
 }
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn trivium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_1_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_1_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_1_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.9.0-alpha.0"
+version = "0.10.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
@@ -15,3 +15,7 @@ keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
 cmake = { version = "0.1" }
 pkg-config = { version = "0.3" }
 bindgen = "0.71"
+
+[features]
+experimental-multi-arch = []
+profile = []
--- a/backends/tfhe-cuda-backend/build.rs
+++ b/backends/tfhe-cuda-backend/build.rs
@@ -37,7 +37,24 @@ fn main() {
            );
        }

-        let dest = cmake::build("cuda");
+        let mut cmake_config = cmake::Config::new("cuda");
+
+        // Pass the "MULTI_ARCH" variable to CMake: ON if the feature is enabled, OFF otherwise
+        if cfg!(feature = "experimental-multi-arch") {
+            cmake_config.define("MULTI_ARCH", "ON");
+        } else {
+            cmake_config.define("MULTI_ARCH", "OFF");
+        }
+        // Pass the "USE_NVTOOLS" variable to CMake: ON if the feature is enabled, OFF otherwise
+        if cfg!(feature = "profile") {
+            cmake_config.define("USE_NVTOOLS", "ON");
+            println!("cargo:rustc-link-lib=nvToolsExt");
+        } else {
+            cmake_config.define("USE_NVTOOLS", "OFF");
+        }
+
+        // Build the CMake project
+        let dest = cmake_config.build();
        println!("cargo:rustc-link-search=native={}", dest.display());
        println!("cargo:rustc-link-lib=static=tfhe_cuda_backend");

@@ -60,7 +77,9 @@ fn main() {
            "cuda/include/ciphertext.h",
            "cuda/include/integer/compression/compression.h",
            "cuda/include/integer/integer.h",
-            "cuda/include/keyswitch.h",
+            "cuda/include/zk/zk.h",
+            "cuda/include/keyswitch/keyswitch.h",
+            "cuda/include/keyswitch/ks_enums.h",
            "cuda/include/linear_algebra.h",
            "cuda/include/fft/fft128.h",
            "cuda/include/pbs/programmable_bootstrap.h",
@@ -74,7 +93,7 @@ fn main() {
        };
        let mut headers_modified = bindings_modified;
        for header in headers {
-            println!("cargo:rerun-if-changed={}", header);
+            println!("cargo:rerun-if-changed={header}");
            // Check modification times
            let header_modified = std::fs::metadata(header).unwrap().modified().unwrap();
            if header_modified > headers_modified {
--- a/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
+++ b/backends/tfhe-cuda-backend/cuda/CMakeLists.txt
@@ -34,7 +34,13 @@ else()
  set(CUDA_SUCCESS "FALSE")
 endif()

-if(${CUDA_SUCCESS})
+if(${MULTI_ARCH})
+  message(STATUS "Multi-architecture GPU build enabled, 75, 80, 86, 89")
+  message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
+  message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
+  message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
+  message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")
+elseif(${CUDA_SUCCESS})
  message(STATUS "CUDA Architecture: ${ARCH}")
  message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
  message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
@@ -56,7 +62,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}  -g")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
-if(${CUDA_SUCCESS})
+if(${MULTI_ARCH})
+  set(CUDA_ARCH "750")
+  set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89)
+elseif(${CUDA_SUCCESS})
  set(CMAKE_CUDA_ARCHITECTURES native)
  string(REPLACE "-arch=sm_" "" CUDA_ARCH "${ARCH}")
  set(CUDA_ARCH "${CUDA_ARCH}0")
@@ -79,11 +88,18 @@ else()
  set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
 endif()

-# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
+# Check if the USE_NVTOOLS CMake variable is enabled
+if(${USE_NVTOOLS})
+  message(STATUS "USE_NVTOOLS is enabled")
+  add_definitions(-DUSE_NVTOOLS)
+endif()
+
+# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills, -lineinfo for better debugging,
+# and -lnvToolsExt to use nvtx when profiling
 set(CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
  -std=c++17 --no-exceptions  --expt-relaxed-constexpr -rdc=true \
-  --use_fast_math -Xcompiler -fPIC")
+  --use_fast_math -Xcompiler -fPIC ")

 set(INCLUDE_DIR include)

--- a/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/ciphertext.h
@@ -4,6 +4,7 @@
 #include "stdint.h"

 extern "C" {
+
 void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *stream,
                                                  uint32_t gpu_index,
                                                  void *dest, void const *src,
@@ -20,5 +21,22 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
                                 uint32_t const *nth_array, uint32_t num_nths,
                                 uint32_t lwe_per_glwe, uint32_t glwe_dimension,
                                 uint32_t polynomial_size);
+
+void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
+                                    void *lwe_array_out, uint32_t size,
+                                    uint32_t log_modulus);
+
+void cuda_improve_noise_modulus_switch_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus);
+
+void cuda_glwe_sample_extract_128(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
+    uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
 }
+
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -8,7 +8,6 @@
 #include <cuda_runtime.h>
 #include <vector>

-#define synchronize_threads_in_block() __syncthreads()
 extern "C" {

 #define check_cuda_error(ans)                                                  \
@@ -48,22 +47,42 @@ uint32_t cuda_is_available();

 void *cuda_malloc(uint64_t size, uint32_t gpu_index);

+void *cuda_malloc_with_size_tracking_async(uint64_t size, cudaStream_t stream,
+                                           uint32_t gpu_index,
+                                           uint64_t *size_tracker,
+                                           bool allocate_gpu_memory);
+
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);

-void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
+bool cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
+
+void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
+                                                 uint64_t size,
+                                                 cudaStream_t stream,
+                                                 uint32_t gpu_index,
+                                                 bool gpu_memory_allocated);

 void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

+void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
+    void *dest, void const *src, uint64_t size, cudaStream_t stream,
+    uint32_t gpu_index, bool gpu_memory_allocated);
+
 void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                                  cudaStream_t stream, uint32_t gpu_index);

-void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
+void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
                            uint32_t gpu_index);

 void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cudaStream_t stream, uint32_t gpu_index);

+void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
+                                          uint64_t size, cudaStream_t stream,
+                                          uint32_t gpu_index,
+                                          bool gpu_memory_allocated);
+
 void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                       cudaStream_t stream, uint32_t gpu_index);

@@ -73,6 +92,10 @@ void cuda_synchronize_device(uint32_t gpu_index);

 void cuda_drop(void *ptr, uint32_t gpu_index);

+void cuda_drop_with_size_tracking_async(void *ptr, cudaStream_t stream,
+                                        uint32_t gpu_index,
+                                        bool gpu_memory_allocated);
+
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
 }

--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h
@@ -4,7 +4,7 @@
 #include "../../pbs/pbs_enums.h"

 extern "C" {
-void scratch_cuda_integer_compress_radix_ciphertext_64(
+uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
@@ -13,15 +13,15 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
    bool allocate_gpu_memory);

-void scratch_cuda_integer_decompress_radix_ciphertext_64(
+uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
    uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t storage_log_modulus, uint32_t body_count,
-    bool allocate_gpu_memory);
+    uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
+    bool allocate_ms_array);

 void cuda_integer_compress_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -14,41 +14,44 @@ template <typename Torus> struct int_compression {
  int8_t *fp_ks_buffer;
  Torus *tmp_lwe;
  Torus *tmp_glwe_array_out;
+  bool gpu_memory_allocated;

  int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                  uint32_t gpu_count, int_radix_params compression_params,
                  uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
-                  uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+                  uint32_t storage_log_modulus, bool allocate_gpu_memory,
+                  uint64_t *size_tracker) {
+    gpu_memory_allocated = allocate_gpu_memory;
    this->compression_params = compression_params;
    this->lwe_per_glwe = lwe_per_glwe;
    this->storage_log_modulus = storage_log_modulus;
    this->body_count = num_radix_blocks;

-    if (allocate_gpu_memory) {
-      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
-                                    compression_params.polynomial_size;
+    Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
+                                  compression_params.polynomial_size;

-      tmp_lwe = (Torus *)cuda_malloc_async(
-          num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
-              sizeof(Torus),
-          streams[0], gpu_indexes[0]);
-      tmp_glwe_array_out = (Torus *)cuda_malloc_async(
-          lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
-          gpu_indexes[0]);
+    tmp_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
+        num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
+            sizeof(Torus),
+        streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
+    tmp_glwe_array_out = (Torus *)cuda_malloc_with_size_tracking_async(
+        lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
+        gpu_indexes[0], size_tracker, allocate_gpu_memory);

-      scratch_packing_keyswitch_lwe_list_to_glwe_64(
-          streams[0], gpu_indexes[0], &fp_ks_buffer,
-          compression_params.small_lwe_dimension,
-          compression_params.glwe_dimension, compression_params.polynomial_size,
-          num_radix_blocks, true);
-    }
+    *size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
+        streams[0], gpu_indexes[0], &fp_ks_buffer,
+        compression_params.small_lwe_dimension,
+        compression_params.glwe_dimension, compression_params.polynomial_size,
+        num_radix_blocks, allocate_gpu_memory);
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
-    cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
-    cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
-    cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0],
-                                               &fp_ks_buffer);
+    cuda_drop_with_size_tracking_async(tmp_lwe, streams[0], gpu_indexes[0],
+                                       gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(tmp_glwe_array_out, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);
+    cleanup_packing_keyswitch_lwe_list_to_glwe(
+        streams[0], gpu_indexes[0], &fp_ks_buffer, gpu_memory_allocated);
  }
 };

@@ -66,66 +69,71 @@ template <typename Torus> struct int_decompression {
  uint32_t *tmp_indexes_array;

  int_radix_lut<Torus> *decompression_rescale_lut;
+  bool gpu_memory_allocated;

  int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                    uint32_t gpu_count, int_radix_params encryption_params,
                    int_radix_params compression_params,
                    uint32_t num_radix_blocks, uint32_t body_count,
-                    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+                    uint32_t storage_log_modulus, bool allocate_gpu_memory,
+                    uint64_t *size_tracker) {
+    gpu_memory_allocated = allocate_gpu_memory;
    this->encryption_params = encryption_params;
    this->compression_params = compression_params;
    this->storage_log_modulus = storage_log_modulus;
    this->num_radix_blocks = num_radix_blocks;
    this->body_count = body_count;

-    if (allocate_gpu_memory) {
-      Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
-                                    compression_params.polynomial_size;
-      Torus lwe_accumulator_size = (compression_params.glwe_dimension *
-                                        compression_params.polynomial_size +
-                                    1);
-      decompression_rescale_lut = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, encryption_params, 1,
-          num_radix_blocks, allocate_gpu_memory);
+    Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
+                                  compression_params.polynomial_size;
+    Torus lwe_accumulator_size = (compression_params.glwe_dimension *
+                                      compression_params.polynomial_size +
+                                  1);
+    decompression_rescale_lut = new int_radix_lut<Torus>(
+        streams, gpu_indexes, gpu_count, encryption_params, 1, num_radix_blocks,
+        allocate_gpu_memory, size_tracker);

-      tmp_extracted_glwe = (Torus *)cuda_malloc_async(
-          num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
-          gpu_indexes[0]);
-      tmp_indexes_array = (uint32_t *)cuda_malloc_async(
-          num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0]);
-      tmp_extracted_lwe = (Torus *)cuda_malloc_async(
-          num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
-          gpu_indexes[0]);
+    tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
+        num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
+        gpu_indexes[0], size_tracker, allocate_gpu_memory);
+    tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
+        num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0],
+        size_tracker, allocate_gpu_memory);
+    tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
+        num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
+        gpu_indexes[0], size_tracker, allocate_gpu_memory);

-      // Rescale is done using an identity LUT
-      // Here we do not divide by message_modulus
-      // Example: in the 2_2 case we are mapping a 2 bits message onto a 4 bits
-      // space, we want to keep the original 2 bits value in the 4 bits space,
-      // so we apply the identity and the encoding will rescale it for us.
-      auto decompression_rescale_f = [](Torus x) -> Torus { return x; };
+    // Rescale is done using an identity LUT
+    // Here we do not divide by message_modulus
+    // Example: in the 2_2 case we are mapping a 2 bits message onto a 4 bits
+    // space, we want to keep the original 2 bits value in the 4 bits space,
+    // so we apply the identity and the encoding will rescale it for us.
+    auto decompression_rescale_f = [](Torus x) -> Torus { return x; };

-      auto effective_compression_message_modulus =
-          encryption_params.carry_modulus;
-      auto effective_compression_carry_modulus = 1;
+    auto effective_compression_message_modulus =
+        encryption_params.carry_modulus;
+    auto effective_compression_carry_modulus = 1;

-      generate_device_accumulator_with_encoding<Torus>(
-          streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
-          decompression_rescale_lut->get_degree(0),
-          decompression_rescale_lut->get_max_degree(0),
-          encryption_params.glwe_dimension, encryption_params.polynomial_size,
-          effective_compression_message_modulus,
-          effective_compression_carry_modulus,
-          encryption_params.message_modulus, encryption_params.carry_modulus,
-          decompression_rescale_f);
+    generate_device_accumulator_with_encoding<Torus>(
+        streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
+        decompression_rescale_lut->get_degree(0),
+        decompression_rescale_lut->get_max_degree(0),
+        encryption_params.glwe_dimension, encryption_params.polynomial_size,
+        effective_compression_message_modulus,
+        effective_compression_carry_modulus, encryption_params.message_modulus,
+        encryption_params.carry_modulus, decompression_rescale_f,
+        gpu_memory_allocated);

-      decompression_rescale_lut->broadcast_lut(streams, gpu_indexes, 0);
-    }
+    decompression_rescale_lut->broadcast_lut(streams, gpu_indexes, 0);
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
-    cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
-    cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
-    cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]);
+    cuda_drop_with_size_tracking_async(tmp_extracted_glwe, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(tmp_extracted_lwe, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(tmp_indexes_array, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);

    decompression_rescale_lut->release(streams, gpu_indexes, gpu_count);
    delete decompression_rescale_lut;
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -48,49 +48,53 @@ typedef struct {
  uint32_t lwe_dimension;
 } CudaRadixCiphertextFFI;

-void scratch_cuda_apply_univariate_lut_kb_64(
+uint64_t scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory);
-void scratch_cuda_apply_many_univariate_lut_kb_64(
+    uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array);
+uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory);
+    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
+    bool allocate_ms_array);
 void cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks);

 void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
                                             uint32_t const *gpu_indexes,
                                             uint32_t gpu_count,
                                             int8_t **mem_ptr_void);

-void scratch_cuda_apply_bivariate_lut_kb_64(
+uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory);
+    uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe_1,
    CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
-    uint32_t shift);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_radix_blocks, uint32_t shift);

 void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
                                            uint32_t const *gpu_indexes,
@@ -101,44 +105,45 @@ void cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_luts,
-    uint32_t lut_stride);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_luts, uint32_t lut_stride);

-void scratch_cuda_full_propagation_64(
+uint64_t scratch_cuda_full_propagation_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

-void cuda_full_propagation_64_inplace(void *const *streams,
-                                      uint32_t const *gpu_indexes,
-                                      uint32_t gpu_count,
-                                      CudaRadixCiphertextFFI *input_blocks,
-                                      int8_t *mem_ptr, void *const *ksks,
-                                      void *const *bsks, uint32_t num_blocks);
+void cuda_full_propagation_64_inplace(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *input_blocks, int8_t *mem_ptr, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_blocks);

 void cleanup_cuda_full_propagation(void *const *streams,
                                   uint32_t const *gpu_indexes,
                                   uint32_t gpu_count, int8_t **mem_ptr_void);

-void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, bool const is_boolean_left, bool const is_boolean_right,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
    uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
    uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_mult_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
    CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
-    void *const *bsks, void *const *ksks, int8_t *mem_ptr,
-    uint32_t polynomial_size, uint32_t num_blocks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);

 void cleanup_cuda_integer_mult(void *const *streams,
                               uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -156,33 +161,35 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    void const *h_scalar_input, uint32_t num_scalars, uint32_t message_modulus,
    uint32_t carry_modulus);

-void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
+uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

-void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
+uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_logical_scalar_shift(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -192,87 +199,94 @@ void cleanup_cuda_integer_radix_arithmetic_scalar_shift(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);

-void scratch_cuda_integer_radix_shift_and_rotate_kb_64(
+uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool is_signed, bool allocate_gpu_memory);
+    bool is_signed, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI const *lwe_shift,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
                                                 uint32_t const *gpu_indexes,
                                                 uint32_t gpu_count,
                                                 int8_t **mem_ptr_void);

-void scratch_cuda_integer_radix_comparison_kb_64(
+uint64_t scratch_cuda_integer_radix_comparison_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory);
+    COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
+    bool allocate_ms_array);

 void cuda_comparison_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
    void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t num_scalar_blocks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_scalar_blocks);

 void cleanup_cuda_integer_comparison(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count, int8_t **mem_ptr_void);

-void scratch_cuda_integer_radix_bitop_kb_64(
+uint64_t scratch_cuda_integer_radix_bitop_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    BITOP_TYPE op_type, bool allocate_gpu_memory);
+    BITOP_TYPE op_type, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_1,
    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
    void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_bitop(void *const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
                                int8_t **mem_ptr_void);

-void scratch_cuda_integer_radix_cmux_kb_64(
+uint64_t scratch_cuda_integer_radix_cmux_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_cmux_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -280,60 +294,65 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
    CudaRadixCiphertextFFI const *lwe_condition,
    CudaRadixCiphertextFFI const *lwe_array_true,
    CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_cmux(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count, int8_t **mem_ptr_void);

-void scratch_cuda_integer_radix_scalar_rotate_kb_64(
+uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_radix_scalar_rotate_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
                                              uint32_t const *gpu_indexes,
                                              uint32_t gpu_count,
                                              int8_t **mem_ptr_void);

-void scratch_cuda_propagate_single_carry_kb_64_inplace(
+uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
-    uint32_t uses_carry, bool allocate_gpu_memory);
+    uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array);

-void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
+uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
-    uint32_t uses_carry, bool allocate_gpu_memory);
+    uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI *carry_out,
    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t requested_flag, uint32_t uses_carry);

 void cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
    CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    uint32_t requested_flag, uint32_t uses_carry);

 void cleanup_cuda_propagate_single_carry(void *const *streams,
@@ -346,101 +365,107 @@ void cleanup_cuda_add_and_propagate_single_carry(void *const *streams,
                                                 uint32_t gpu_count,
                                                 int8_t **mem_ptr_void);

-void scratch_cuda_integer_overflowing_sub_kb_64_inplace(
+uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
    uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_overflowing_sub_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
    CudaRadixCiphertextFFI *overflow_block,
    const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t compute_overflow,
-    uint32_t uses_input_borrow);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t compute_overflow, uint32_t uses_input_borrow);

 void cleanup_cuda_integer_overflowing_sub(void *const *streams,
                                          uint32_t const *gpu_indexes,
                                          uint32_t gpu_count,
                                          int8_t **mem_ptr_void);

-void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *radix_lwe_out,
    CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);

-void scratch_cuda_integer_scalar_mul_kb_64(
+uint64_t scratch_cuda_integer_scalar_mul_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array, uint64_t const *decomposed_scalar,
    uint64_t const *has_at_least_one_set, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t num_scalars);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars);

 void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void);

-void scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    bool is_signed, int8_t **mem_ptr, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_div_rem_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *quotient, CudaRadixCiphertextFFI *remainder,
    CudaRadixCiphertextFFI const *numerator,
    CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_div_rem(void *const *streams,
                                  uint32_t const *gpu_indexes,
                                  uint32_t gpu_count, int8_t **mem_ptr_void);

-void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
+uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory);
+    uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_blocks);
+    void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    void *const *bsks, uint32_t num_blocks);

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -451,58 +476,63 @@ void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
                                            uint32_t gpu_count,
                                            CudaRadixCiphertextFFI *lwe_array);

-void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
+uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, bool is_signed, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t big_lwe_dimension,
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, bool allocate_gpu_memory);
+    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
-    void *const *bsks, void *const *ksks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

 void cleanup_cuda_integer_abs_inplace(void *const *streams,
                                      uint32_t const *gpu_indexes,
                                      uint32_t gpu_count,
                                      int8_t **mem_ptr_void);

-void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_are_all_comparisons_block_true_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks);

 void cleanup_cuda_integer_are_all_comparisons_block_true(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);

-void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
+    bool allocate_gpu_memory, bool allocate_ms_array);

 void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
+    void *const *bsks, void *const *ksks,
+    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
+    uint32_t num_radix_blocks);

 void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
@@ -3,9 +3,13 @@

 void release_radix_ciphertext_async(cudaStream_t const stream,
                                    uint32_t const gpu_index,
-                                    CudaRadixCiphertextFFI *data);
+                                    CudaRadixCiphertextFFI *data,
+                                    const bool gpu_memory_allocated);

 void reset_radix_ciphertext_blocks(CudaRadixCiphertextFFI *data,
                                   uint32_t new_num_blocks);

+void into_radix_ciphertext(CudaRadixCiphertextFFI *radix, void *lwe_array,
+                           const uint32_t num_radix_blocks,
+                           const uint32_t lwe_dimension);
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h
@@ -19,7 +19,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples);

-void scratch_packing_keyswitch_lwe_list_to_glwe_64(
+uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_lwes, bool allocate_gpu_memory);
@@ -31,9 +31,22 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes);

+void scratch_packing_keyswitch_lwe_list_to_glwe_128(
+    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t num_lwes, bool allocate_gpu_memory);
+
+void cuda_packing_keyswitch_lwe_list_to_glwe_128(
+    void *stream, uint32_t gpu_index, void *glwe_array_out,
+    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
+    uint32_t input_lwe_dimension, uint32_t output_glwe_dimension,
+    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
+    uint32_t num_lwes);
+
 void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
                                                uint32_t gpu_index,
-                                                int8_t **fp_ks_buffer);
+                                                int8_t **fp_ks_buffer,
+                                                bool gpu_memory_allocated);
 }

 #endif // CNCRT_KS_H_
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch/ks_enums.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch/ks_enums.h
@@ -0,0 +1,6 @@
+#ifndef CUDA_KS_ENUMS_H
+#define CUDA_KS_ENUMS_H
+
+enum KS_TYPE { BIG_TO_SMALL = 0, SMALL_TO_BIG = 1 };
+
+#endif // CUDA_KS_ENUMS_H
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -42,6 +42,24 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
    void const *lwe_array_in, void const *cleartext_array_in,
    const uint32_t input_lwe_dimension,
    const uint32_t input_lwe_ciphertext_count);
+
+void scratch_wrapping_polynomial_mul_one_to_many_64(void *stream,
+                                                    uint32_t gpu_index,
+                                                    uint32_t polynomial_size,
+                                                    int8_t **circulant_buf);
+
+void cleanup_wrapping_polynomial_mul_one_to_many_64(void *stream,
+                                                    uint32_t gpu_index,
+                                                    int8_t *circulant_buf);
+
+void cuda_wrapping_polynomial_mul_one_to_many_64(
+    void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
+    int8_t *circulant, void const *poly_rhs, uint32_t polynomial_size,
+    uint32_t n_rhs);
+void cuda_glwe_wrapping_polynomial_mul_one_to_many_64(
+    void *stream, uint32_t gpu_index, void *result, void const *poly_lhs,
+    int8_t *circulant, void const *poly_rhs, uint32_t polynomial_size,
+    uint32_t glwe_dimension, uint32_t n_rhs);
 void cuda_add_lwe_ciphertext_vector_plaintext_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, const uint64_t plaintext_in,
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_enums.h
@@ -1,7 +1,17 @@
 #ifndef CUDA_PBS_ENUMS_H
 #define CUDA_PBS_ENUMS_H
-
+#include <stdint.h>
 enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
 enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };

+extern "C" {
+typedef struct {
+  void *const *ptr;
+  uint32_t num_zeros;
+  double ms_bound;
+  double ms_r_sigma;
+  double ms_input_variance;
+} CudaModulusSwitchNoiseReductionKeyFFI;
+}
+
 #endif // CUDA_PBS_ENUMS_H
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -14,7 +14,7 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(

 #if CUDA_ARCH >= 900
 template <typename Torus>
-void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
+uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
@@ -32,7 +32,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
 #endif

 template <typename Torus>
-void scratch_cuda_cg_multi_bit_programmable_bootstrap(
+uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
@@ -49,7 +49,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
    uint32_t num_many_lut, uint32_t lut_stride);

 template <typename Torus>
-void scratch_cuda_multi_bit_programmable_bootstrap(
+uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
    void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
@@ -109,11 +109,14 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  double2 *global_join_buffer;

  PBS_VARIANT pbs_variant;
+  bool gpu_memory_allocated;

  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
-             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
+             PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
+             uint64_t *size_tracker) {
+    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);

    this->pbs_variant = pbs_variant;
@@ -164,107 +167,117 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
    auto num_blocks_acc_tbc = num_blocks_acc_cg;
 #endif

-    if (allocate_gpu_memory) {
-      // Keybundle
-      if (max_shared_memory < full_sm_keybundle)
-        d_mem_keybundle = (int8_t *)cuda_malloc_async(
-            num_blocks_keybundle * full_sm_keybundle, stream, gpu_index);
+    // Keybundle
+    if (max_shared_memory < full_sm_keybundle)
+      d_mem_keybundle = (int8_t *)cuda_malloc_with_size_tracking_async(
+          num_blocks_keybundle * full_sm_keybundle, stream, gpu_index,
+          size_tracker, allocate_gpu_memory);

-      switch (pbs_variant) {
-      case PBS_VARIANT::CG:
-        // Accumulator CG
-        if (max_shared_memory < partial_sm_cg_accumulate)
-          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index);
-        else if (max_shared_memory < full_sm_cg_accumulate)
-          d_mem_acc_cg = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index);
-        break;
-      case PBS_VARIANT::DEFAULT:
-        // Accumulator step one
-        if (max_shared_memory < partial_sm_accumulate_step_one)
-          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
-              gpu_index);
-        else if (max_shared_memory < full_sm_accumulate_step_one)
-          d_mem_acc_step_one = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
-              gpu_index);
-
-        // Accumulator step two
-        if (max_shared_memory < full_sm_accumulate_step_two)
-          d_mem_acc_step_two = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
-              gpu_index);
-        break;
-#if CUDA_ARCH >= 900
-      case TBC:
-        // There is a minimum amount of memory we need to run the TBC PBS, which
-        // is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
-        // because otherwise the previous check would have redirected
-        // computation to some other variant. If over that we don't have more
-        // partial_sm_tbc_accumulate bytes, TBC PBS will run on NOSM. If we have
-        // partial_sm_tbc_accumulate but not full_sm_tbc_accumulate bytes, it
-        // will run on PARTIALSM. Otherwise, FULLSM.
-        //
-        // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
-
-        // Accumulator TBC
-        if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
-          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_tbc * full_sm_tbc_accumulate, stream, gpu_index);
-        else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
-          d_mem_acc_tbc = (int8_t *)cuda_malloc_async(
-              num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream,
-              gpu_index);
-        break;
-#endif
-      default:
-        PANIC("Cuda error (PBS): unsupported implementation variant.")
-      }
-
-      keybundle_fft = (double2 *)cuda_malloc_async(
-          num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
-          stream, gpu_index);
-      global_accumulator = (Torus *)cuda_malloc_async(
-          input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
-              sizeof(Torus),
-          stream, gpu_index);
-      global_join_buffer = (double2 *)cuda_malloc_async(
-          level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
-              (polynomial_size / 2) * sizeof(double2),
-          stream, gpu_index);
-    }
-  }
-
-  void release(cudaStream_t stream, uint32_t gpu_index) {
-
-    if (d_mem_keybundle)
-      cuda_drop_async(d_mem_keybundle, stream, gpu_index);
    switch (pbs_variant) {
-    case DEFAULT:
-      if (d_mem_acc_step_one)
-        cuda_drop_async(d_mem_acc_step_one, stream, gpu_index);
-      if (d_mem_acc_step_two)
-        cuda_drop_async(d_mem_acc_step_two, stream, gpu_index);
+    case PBS_VARIANT::CG:
+      // Accumulator CG
+      if (max_shared_memory < partial_sm_cg_accumulate)
+        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_cg * full_sm_cg_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
+      else if (max_shared_memory < full_sm_cg_accumulate)
+        d_mem_acc_cg = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_cg * partial_sm_cg_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
      break;
-    case CG:
-      if (d_mem_acc_cg)
-        cuda_drop_async(d_mem_acc_cg, stream, gpu_index);
+    case PBS_VARIANT::DEFAULT:
+      // Accumulator step one
+      if (max_shared_memory < partial_sm_accumulate_step_one)
+        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_step_one * full_sm_accumulate_step_one, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
+      else if (max_shared_memory < full_sm_accumulate_step_one)
+        d_mem_acc_step_one = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_step_one * partial_sm_accumulate_step_one, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
+
+      // Accumulator step two
+      if (max_shared_memory < full_sm_accumulate_step_two)
+        d_mem_acc_step_two = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_step_two * full_sm_accumulate_step_two, stream,
+            gpu_index, size_tracker, allocate_gpu_memory);
      break;
 #if CUDA_ARCH >= 900
    case TBC:
-      if (d_mem_acc_tbc)
-        cuda_drop_async(d_mem_acc_tbc, stream, gpu_index);
+      // Running the TBC PBS needs at least minimum_sm_tbc bytes of shared
+      // memory. That much is known to be available here, because otherwise
+      // the previous check would have redirected the computation to another
+      // variant. If fewer than partial_sm_tbc_accumulate bytes remain on top
+      // of that minimum, the TBC PBS runs in NOSM mode. If
+      // partial_sm_tbc_accumulate but not full_sm_tbc_accumulate bytes
+      // remain, it runs in PARTIALSM mode. Otherwise it runs in FULLSM mode.
+      //
+      // Note that even NOSM mode requires minimum_sm_tbc shared memory bytes.
+
+      // Accumulator TBC
+      if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc)
+        d_mem_acc_tbc = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_tbc * full_sm_tbc_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
+      else if (max_shared_memory < full_sm_tbc_accumulate + minimum_sm_tbc)
+        d_mem_acc_tbc = (int8_t *)cuda_malloc_with_size_tracking_async(
+            num_blocks_acc_tbc * partial_sm_tbc_accumulate, stream, gpu_index,
+            size_tracker, allocate_gpu_memory);
      break;
 #endif
    default:
      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }

-    cuda_drop_async(keybundle_fft, stream, gpu_index);
-    cuda_drop_async(global_accumulator, stream, gpu_index);
-    cuda_drop_async(global_join_buffer, stream, gpu_index);
+    keybundle_fft = (double2 *)cuda_malloc_with_size_tracking_async(
+        num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2), stream,
+        gpu_index, size_tracker, allocate_gpu_memory);
+    global_accumulator = (Torus *)cuda_malloc_with_size_tracking_async(
+        input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
+            sizeof(Torus),
+        stream, gpu_index, size_tracker, allocate_gpu_memory);
+    global_join_buffer = (double2 *)cuda_malloc_with_size_tracking_async(
+        level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
+            (polynomial_size / 2) * sizeof(double2),
+        stream, gpu_index, size_tracker, allocate_gpu_memory);
+  }
+
+  void release(cudaStream_t stream, uint32_t gpu_index) {
+
+    if (d_mem_keybundle)
+      cuda_drop_with_size_tracking_async(d_mem_keybundle, stream, gpu_index,
+                                         gpu_memory_allocated);
+    switch (pbs_variant) {
+    case DEFAULT:
+      if (d_mem_acc_step_one)
+        cuda_drop_with_size_tracking_async(d_mem_acc_step_one, stream,
+                                           gpu_index, gpu_memory_allocated);
+      if (d_mem_acc_step_two)
+        cuda_drop_with_size_tracking_async(d_mem_acc_step_two, stream,
+                                           gpu_index, gpu_memory_allocated);
+      break;
+    case CG:
+      if (d_mem_acc_cg)
+        cuda_drop_with_size_tracking_async(d_mem_acc_cg, stream, gpu_index,
+                                           gpu_memory_allocated);
+      break;
+#if CUDA_ARCH >= 900
+    case TBC:
+      if (d_mem_acc_tbc)
+        cuda_drop_with_size_tracking_async(d_mem_acc_tbc, stream, gpu_index,
+                                           gpu_memory_allocated);
+      break;
+#endif
+    default:
+      PANIC("Cuda error (PBS): unsupported implementation variant.")
+    }
+
+    cuda_drop_with_size_tracking_async(keybundle_fft, stream, gpu_index,
+                                       gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
+                                       gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
+                                       gpu_memory_allocated);
  }
 };

--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -54,15 +54,19 @@ uint64_t get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap(
 template <typename Torus>
 uint64_t
 get_buffer_size_full_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
-  return sizeof(Torus) * polynomial_size +      // accumulator_rotated
-         sizeof(Torus) * polynomial_size +      // accumulator
-         sizeof(double2) * polynomial_size / 2; // accumulator fft
+  size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
+  return sizeof(Torus) * polynomial_size + // accumulator_rotated
+         sizeof(Torus) * polynomial_size + // accumulator
+         sizeof(double) * polynomial_size / 2 * 2 *
+             double_count; // accumulator fft
 }

 template <typename Torus>
 uint64_t
 get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
-  return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
+  size_t double_count = (sizeof(Torus) == 16) ? 2 : 1;
+  return sizeof(double) * polynomial_size / 2 * 2 *
+         double_count; // accumulator fft mask & body
 }

 template <typename Torus>
@@ -76,148 +80,163 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

  Torus *global_accumulator;
  double2 *global_join_buffer;
+  Torus *temp_lwe_array_in;

  PBS_VARIANT pbs_variant;
+  bool uses_noise_reduction;
+  bool gpu_memory_allocated;

-  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t glwe_dimension,
-             uint32_t polynomial_size, uint32_t level_count,
-             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
-             bool allocate_gpu_memory) {
+  // Builds the scratch buffers of the classical PBS for the given variant.
+  //
+  // - lwe_dimension and input_lwe_ciphertext_count size temp_lwe_array_in.
+  // - glwe_dimension, polynomial_size, level_count and
+  //   input_lwe_ciphertext_count size d_mem, global_join_buffer and, for the
+  //   DEFAULT variant only, global_accumulator.
+  // - allocate_gpu_memory is forwarded to every allocation helper and stored
+  //   in gpu_memory_allocated so that release() passes the same flag.
+  // - allocate_ms_array is stored in uses_noise_reduction; release() drops
+  //   temp_lwe_array_in only when it is set.
+  // - size_tracker is handed to every allocation helper.
+  //
+  // Panics on a pbs_variant that has no case below.
+  pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t lwe_dimension,
+             uint32_t glwe_dimension, uint32_t polynomial_size,
+             uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+             PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
+             bool allocate_ms_array, uint64_t *size_tracker) {
+    gpu_memory_allocated = allocate_gpu_memory;
     cuda_set_device(gpu_index);
+    this->uses_noise_reduction = allocate_ms_array;
     this->pbs_variant = pbs_variant;

     auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+    // release() drops this array with gpu_memory_allocated, so the allocation
+    // must be gated by allocate_gpu_memory as well as allocate_ms_array;
+    // otherwise a call with allocate_gpu_memory == false would allocate the
+    // array and never free it.
+    // NOTE(review): assumes the last helper argument decides whether memory
+    // is really allocated / freed - confirm against the helper definitions.
+    this->temp_lwe_array_in = (Torus *)cuda_malloc_with_size_tracking_async(
+        (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(Torus),
+        stream, gpu_index, size_tracker,
+        allocate_gpu_memory && allocate_ms_array);
+    switch (pbs_variant) {
+    case PBS_VARIANT::DEFAULT: {
+      uint64_t full_sm_step_one =
+          get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
+              polynomial_size);
+      uint64_t full_sm_step_two =
+          get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
+              polynomial_size);
+      uint64_t partial_sm =
+          get_buffer_size_partial_sm_programmable_bootstrap<Torus>(
+              polynomial_size);

-    if (allocate_gpu_memory) {
-      switch (pbs_variant) {
-      case PBS_VARIANT::DEFAULT: {
-        uint64_t full_sm_step_one =
-            get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
-                polynomial_size);
-        uint64_t full_sm_step_two =
-            get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
-                polynomial_size);
-        uint64_t partial_sm =
-            get_buffer_size_partial_sm_programmable_bootstrap<Torus>(
-                polynomial_size);
+      uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
+      uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
+      uint64_t full_dm = full_sm_step_one;

-        uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
-        uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
-        uint64_t full_dm = full_sm_step_one;
-
-        uint64_t device_mem = 0;
-        if (max_shared_memory < partial_sm) {
-          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm_step_two) {
-          device_mem =
-              (partial_dm_step_two + partial_dm_step_one * level_count) *
-              input_lwe_ciphertext_count * (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm_step_one) {
-          device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
-                       level_count * (glwe_dimension + 1);
-        }
-        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
-
-        global_join_buffer = (double2 *)cuda_malloc_async(
-            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
-                (polynomial_size / 2) * sizeof(double2),
-            stream, gpu_index);
-
-        global_accumulator = (Torus *)cuda_malloc_async(
-            (glwe_dimension + 1) * input_lwe_ciphertext_count *
-                polynomial_size * sizeof(Torus),
-            stream, gpu_index);
-      } break;
-      case PBS_VARIANT::CG: {
-        uint64_t full_sm =
-            get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(
-                polynomial_size);
-        uint64_t partial_sm =
-            get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
-                polynomial_size);
-
-        uint64_t partial_dm = full_sm - partial_sm;
-        uint64_t full_dm = full_sm;
-        uint64_t device_mem = 0;
-
-        if (max_shared_memory < partial_sm) {
-          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm) {
-          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        }
-
-        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
-
-        global_join_buffer = (double2 *)cuda_malloc_async(
-            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
-                polynomial_size / 2 * sizeof(double2),
-            stream, gpu_index);
-      } break;
-#if CUDA_ARCH >= 900
-      case PBS_VARIANT::TBC: {
-
-        bool supports_dsm =
-            supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-                Torus>(polynomial_size, max_shared_memory);
-
-        uint64_t full_sm =
-            get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
-                polynomial_size);
-        uint64_t partial_sm =
-            get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
-                polynomial_size);
-        uint64_t minimum_sm_tbc = 0;
-        if (supports_dsm)
-          minimum_sm_tbc =
-              get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
-                  Torus>(polynomial_size);
-
-        uint64_t partial_dm = full_sm - partial_sm;
-        uint64_t full_dm = full_sm;
-        uint64_t device_mem = 0;
-
-        // There is a minimum amount of memory we need to run the TBC PBS, which
-        // is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
-        // because otherwise the previous check would have redirected
-        // computation to some other variant. If over that we don't have more
-        // partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
-        // not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
-        //
-        // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
-        if (max_shared_memory < partial_sm + minimum_sm_tbc) {
-          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
-          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        }
-
-        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
-
-        global_join_buffer = (double2 *)cuda_malloc_async(
-            (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
-                polynomial_size / 2 * sizeof(double2),
-            stream, gpu_index);
-      } break;
-#endif
-      default:
-        PANIC("Cuda error (PBS): unsupported implementation variant.")
+      uint64_t device_mem = 0;
+      if (max_shared_memory < partial_sm) {
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      } else if (max_shared_memory < full_sm_step_two) {
+        device_mem = (partial_dm_step_two + partial_dm_step_one * level_count) *
+                     input_lwe_ciphertext_count * (glwe_dimension + 1);
+      } else if (max_shared_memory < full_sm_step_one) {
+        device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
+                     level_count * (glwe_dimension + 1);
       }
+      // Otherwise, both kernels run all in shared memory
+      d_mem = (int8_t *)cuda_malloc_with_size_tracking_async(
+          device_mem, stream, gpu_index, size_tracker, allocate_gpu_memory);
+
+      global_join_buffer = (double2 *)cuda_malloc_with_size_tracking_async(
+          (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
+              (polynomial_size / 2) * sizeof(double2),
+          stream, gpu_index, size_tracker, allocate_gpu_memory);
+
+      global_accumulator = (Torus *)cuda_malloc_with_size_tracking_async(
+          (glwe_dimension + 1) * input_lwe_ciphertext_count * polynomial_size *
+              sizeof(Torus),
+          stream, gpu_index, size_tracker, allocate_gpu_memory);
+    } break;
+    case PBS_VARIANT::CG: {
+      uint64_t full_sm =
+          get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(
+              polynomial_size);
+      uint64_t partial_sm =
+          get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
+              polynomial_size);
+
+      uint64_t partial_dm = full_sm - partial_sm;
+      uint64_t full_dm = full_sm;
+      uint64_t device_mem = 0;
+
+      if (max_shared_memory < partial_sm) {
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      } else if (max_shared_memory < full_sm) {
+        device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      }
+
+      // Otherwise, both kernels run all in shared memory
+      d_mem = (int8_t *)cuda_malloc_with_size_tracking_async(
+          device_mem, stream, gpu_index, size_tracker, allocate_gpu_memory);
+
+      global_join_buffer = (double2 *)cuda_malloc_with_size_tracking_async(
+          (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
+              polynomial_size / 2 * sizeof(double2),
+          stream, gpu_index, size_tracker, allocate_gpu_memory);
+    } break;
+#if CUDA_ARCH >= 900
+    case PBS_VARIANT::TBC: {
+
+      bool supports_dsm =
+          supports_distributed_shared_memory_on_classic_programmable_bootstrap<
+              Torus>(polynomial_size, max_shared_memory);
+
+      uint64_t full_sm =
+          get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
+              polynomial_size);
+      uint64_t partial_sm =
+          get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
+              polynomial_size);
+      uint64_t minimum_sm_tbc = 0;
+      if (supports_dsm)
+        minimum_sm_tbc =
+            get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
+                Torus>(polynomial_size);
+
+      uint64_t partial_dm = full_sm - partial_sm;
+      uint64_t full_dm = full_sm;
+      uint64_t device_mem = 0;
+
+      // There is a minimum amount of memory we need to run the TBC PBS, which
+      // is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
+      // because otherwise the previous check would have redirected
+      // computation to some other variant. If over that we don't have more
+      // partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
+      // not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
+      //
+      // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
+      if (max_shared_memory < partial_sm + minimum_sm_tbc) {
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
+        device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      }
+
+      // Otherwise, both kernels run all in shared memory
+      d_mem = (int8_t *)cuda_malloc_with_size_tracking_async(
+          device_mem, stream, gpu_index, size_tracker, allocate_gpu_memory);
+
+      global_join_buffer = (double2 *)cuda_malloc_with_size_tracking_async(
+          (glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
+              polynomial_size / 2 * sizeof(double2),
+          stream, gpu_index, size_tracker, allocate_gpu_memory);
+    } break;
+#endif
+    default:
+      PANIC("Cuda error (PBS): unsupported implementation variant.")
     }
   }

   // Queues the drop of every buffer held by this object on `stream`.
   // Each drop is given gpu_memory_allocated, the flag stored by the
   // constructor. global_accumulator is dropped only for the DEFAULT variant
   // (the only variant whose constructor case allocates it) and
   // temp_lwe_array_in only when uses_noise_reduction is set.
   void release(cudaStream_t stream, uint32_t gpu_index) {
-    cuda_drop_async(d_mem, stream, gpu_index);
-    cuda_drop_async(global_join_buffer, stream, gpu_index);
+    cuda_drop_with_size_tracking_async(d_mem, stream, gpu_index,
+                                       gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
+                                       gpu_memory_allocated);

     if (pbs_variant == DEFAULT)
-      cuda_drop_async(global_accumulator, stream, gpu_index);
+      cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
+                                         gpu_memory_allocated);
+
+    if (uses_noise_reduction)
+      cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
+                                         gpu_memory_allocated);
   }
 };

@@ -228,145 +247,186 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {

  __uint128_t *global_accumulator;
  double *global_join_buffer;
+  __uint128_t *temp_lwe_array_in;
+  uint64_t *trivial_indexes;

  PBS_VARIANT pbs_variant;
+  bool uses_noise_reduction;
+  bool gpu_memory_allocated;

  pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
-                 uint32_t glwe_dimension, uint32_t polynomial_size,
-                 uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-                 PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
+                 uint32_t lwe_dimension, uint32_t glwe_dimension,
+                 uint32_t polynomial_size, uint32_t level_count,
+                 uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
+                 bool allocate_gpu_memory, bool allocate_ms_array,
+                 uint64_t *size_tracker) {
+    gpu_memory_allocated = allocate_gpu_memory;
    cuda_set_device(gpu_index);
    this->pbs_variant = pbs_variant;
+    this->uses_noise_reduction = allocate_ms_array;
+    if (allocate_ms_array) {
+      this->temp_lwe_array_in =
+          (__uint128_t *)cuda_malloc_with_size_tracking_async(
+              (lwe_dimension + 1) * input_lwe_ciphertext_count *
+                  sizeof(__uint128_t),
+              stream, gpu_index, size_tracker, allocate_ms_array);
+      this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          size_tracker, allocate_ms_array);
+      uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
+      for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
+        h_trivial_indexes[i] = i;

+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          trivial_indexes, h_trivial_indexes,
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          allocate_gpu_memory);
+
+      cuda_synchronize_stream(stream, gpu_index);
+      delete[] h_trivial_indexes;
+    }
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
                                     input_lwe_ciphertext_count *
                                     polynomial_size / 2 * sizeof(double) * 4;

-    if (allocate_gpu_memory) {
-      switch (pbs_variant) {
-      case PBS_VARIANT::DEFAULT: {
-        uint64_t full_sm_step_one =
-            get_buffer_size_full_sm_programmable_bootstrap_step_one<
-                __uint128_t>(polynomial_size);
-        uint64_t full_sm_step_two =
-            get_buffer_size_full_sm_programmable_bootstrap_step_two<
-                __uint128_t>(polynomial_size);
-        uint64_t partial_sm =
-            get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
-                polynomial_size);
+    switch (pbs_variant) {
+    case PBS_VARIANT::DEFAULT: {
+      uint64_t full_sm_step_one =
+          get_buffer_size_full_sm_programmable_bootstrap_step_one<__uint128_t>(
+              polynomial_size);
+      uint64_t full_sm_step_two =
+          get_buffer_size_full_sm_programmable_bootstrap_step_two<__uint128_t>(
+              polynomial_size);
+      uint64_t partial_sm =
+          get_buffer_size_partial_sm_programmable_bootstrap<__uint128_t>(
+              polynomial_size);

-        uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
-        uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
-        uint64_t full_dm = full_sm_step_one;
+      uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
+      uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
+      uint64_t full_dm = full_sm_step_one;

-        uint64_t device_mem = 0;
-        if (max_shared_memory < partial_sm) {
-          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm_step_two) {
-          device_mem =
-              (partial_dm_step_two + partial_dm_step_one * level_count) *
-              input_lwe_ciphertext_count * (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm_step_one) {
-          device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
-                       level_count * (glwe_dimension + 1);
-        }
-        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
-
-        global_join_buffer = (double *)cuda_malloc_async(
-            global_join_buffer_size, stream, gpu_index);
-
-        global_accumulator = (__uint128_t *)cuda_malloc_async(
-            (glwe_dimension + 1) * input_lwe_ciphertext_count *
-                polynomial_size * sizeof(__uint128_t),
-            stream, gpu_index);
-      } break;
-      case PBS_VARIANT::CG: {
-        uint64_t full_sm =
-            get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
-                polynomial_size);
-        uint64_t partial_sm =
-            get_buffer_size_partial_sm_programmable_bootstrap_cg<__uint128_t>(
-                polynomial_size);
-
-        uint64_t partial_dm = full_sm - partial_sm;
-        uint64_t full_dm = full_sm;
-        uint64_t device_mem = 0;
-
-        if (max_shared_memory < partial_sm) {
-          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm) {
-          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        }
-
-        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
-
-        global_join_buffer = (double *)cuda_malloc_async(
-            global_join_buffer_size, stream, gpu_index);
-      } break;
-#if CUDA_ARCH >= 900
-      case PBS_VARIANT::TBC: {
-
-        bool supports_dsm =
-            supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-                __uint128_t>(polynomial_size, max_shared_memory);
-
-        uint64_t full_sm =
-            get_buffer_size_full_sm_programmable_bootstrap_tbc<__uint128_t>(
-                polynomial_size);
-        uint64_t partial_sm =
-            get_buffer_size_partial_sm_programmable_bootstrap_tbc<__uint128_t>(
-                polynomial_size);
-        uint64_t minimum_sm_tbc = 0;
-        if (supports_dsm)
-          minimum_sm_tbc =
-              get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
-                  __uint128_t>(polynomial_size);
-
-        uint64_t partial_dm = full_sm - partial_sm;
-        uint64_t full_dm = full_sm;
-        uint64_t device_mem = 0;
-
-        // There is a minimum amount of memory we need to run the TBC PBS, which
-        // is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
-        // because otherwise the previous check would have redirected
-        // computation to some other variant. If over that we don't have more
-        // partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
-        // not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
-        //
-        // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
-        if (max_shared_memory < partial_sm + minimum_sm_tbc) {
-          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
-          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
-                       (glwe_dimension + 1);
-        }
-
-        // Otherwise, both kernels run all in shared memory
-        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
-
-        global_join_buffer = (double *)cuda_malloc_async(
-            global_join_buffer_size, stream, gpu_index);
-      } break;
-#endif
-      default:
-        PANIC("Cuda error (PBS): unsupported implementation variant.")
+      uint64_t device_mem = 0;
+      if (max_shared_memory < partial_sm) {
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      } else if (max_shared_memory < full_sm_step_two) {
+        device_mem = (partial_dm_step_two + partial_dm_step_one * level_count) *
+                     input_lwe_ciphertext_count * (glwe_dimension + 1);
+      } else if (max_shared_memory < full_sm_step_one) {
+        device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
+                     level_count * (glwe_dimension + 1);
      }
+      // Otherwise, both kernels run all in shared memory
+      d_mem = (int8_t *)cuda_malloc_with_size_tracking_async(
+          device_mem, stream, gpu_index, size_tracker, allocate_gpu_memory);
+
+      global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
+          global_join_buffer_size, stream, gpu_index, size_tracker,
+          allocate_gpu_memory);
+
+      global_accumulator = (__uint128_t *)cuda_malloc_with_size_tracking_async(
+          (glwe_dimension + 1) * input_lwe_ciphertext_count * polynomial_size *
+              sizeof(__uint128_t),
+          stream, gpu_index, size_tracker, allocate_gpu_memory);
+    } break;
+    case PBS_VARIANT::CG: {
+      uint64_t full_sm =
+          get_buffer_size_full_sm_programmable_bootstrap_cg<__uint128_t>(
+              polynomial_size);
+      uint64_t partial_sm =
+          get_buffer_size_partial_sm_programmable_bootstrap_cg<__uint128_t>(
+              polynomial_size);
+
+      uint64_t partial_dm = full_sm - partial_sm;
+      uint64_t full_dm = full_sm;
+      uint64_t device_mem = 0;
+
+      if (max_shared_memory < partial_sm) {
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      } else if (max_shared_memory < full_sm) {
+        device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      }
+
+      // Otherwise, both kernels run all in shared memory
+      d_mem = (int8_t *)cuda_malloc_with_size_tracking_async(
+          device_mem, stream, gpu_index, size_tracker, allocate_gpu_memory);
+
+      global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
+          global_join_buffer_size, stream, gpu_index, size_tracker,
+          allocate_gpu_memory);
+    } break;
+#if CUDA_ARCH >= 900
+    case PBS_VARIANT::TBC: {
+
+      bool supports_dsm =
+          supports_distributed_shared_memory_on_classic_programmable_bootstrap<
+              __uint128_t>(polynomial_size, max_shared_memory);
+
+      uint64_t full_sm =
+          get_buffer_size_full_sm_programmable_bootstrap_tbc<__uint128_t>(
+              polynomial_size);
+      uint64_t partial_sm =
+          get_buffer_size_partial_sm_programmable_bootstrap_tbc<__uint128_t>(
+              polynomial_size);
+      uint64_t minimum_sm_tbc = 0;
+      if (supports_dsm)
+        minimum_sm_tbc =
+            get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<
+                __uint128_t>(polynomial_size);
+
+      uint64_t partial_dm = full_sm - partial_sm;
+      uint64_t full_dm = full_sm;
+      uint64_t device_mem = 0;
+
+      // There is a minimum amount of memory we need to run the TBC PBS, which
+      // is minimum_sm_tbc. We know that minimum_sm_tbc bytes are available
+      // because otherwise the previous check would have redirected
+      // computation to some other variant. If over that we don't have more
+      // partial_sm bytes, TBC PBS will run on NOSM. If we have partial_sm but
+      // not full_sm bytes, it will run on PARTIALSM. Otherwise, FULLSM.
+      //
+      // NOSM mode actually requires minimum_sm_tbc shared memory bytes.
+      if (max_shared_memory < partial_sm + minimum_sm_tbc) {
+        device_mem = full_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      } else if (max_shared_memory < full_sm + minimum_sm_tbc) {
+        device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
+                     (glwe_dimension + 1);
+      }
+
+      // Otherwise, both kernels run all in shared memory
+      d_mem = (int8_t *)cuda_malloc_with_size_tracking_async(
+          device_mem, stream, gpu_index, size_tracker, allocate_gpu_memory);
+
+      global_join_buffer = (double *)cuda_malloc_with_size_tracking_async(
+          global_join_buffer_size, stream, gpu_index, size_tracker,
+          allocate_gpu_memory);
+    } break;
+#endif
+    default:
+      PANIC("Cuda error (PBS): unsupported implementation variant.")
    }
  }

   // Queues the drop of every buffer held by this object on `stream`.
   // Each drop is given gpu_memory_allocated, the flag stored by the
   // constructor. global_accumulator is dropped only for the DEFAULT variant
   // (the only variant whose constructor case allocates it);
   // temp_lwe_array_in and trivial_indexes only when uses_noise_reduction is
   // set.
   void release(cudaStream_t stream, uint32_t gpu_index) {
-    cuda_drop_async(d_mem, stream, gpu_index);
-    cuda_drop_async(global_join_buffer, stream, gpu_index);
+    cuda_drop_with_size_tracking_async(d_mem, stream, gpu_index,
+                                       gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(global_join_buffer, stream, gpu_index,
+                                       gpu_memory_allocated);

     if (pbs_variant == DEFAULT)
-      cuda_drop_async(global_accumulator, stream, gpu_index);
+      cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
+                                         gpu_memory_allocated);
+
+    if (uses_noise_reduction) {
+      cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
+                                         gpu_memory_allocated);
+      cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
+                                         gpu_memory_allocated);
+    }
   }
 };

@@ -437,23 +497,26 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
    uint32_t lut_stride);

 // Declaration of the scratch routine for the TBC variant of the classical
 // PBS; it receives the output buffer through `pbs_buffer`.
 // NOTE(review): the uint64_t return presumably reports the size accumulated
 // by the allocation tracker - confirm against the definition.
 template <typename Torus>
-void scratch_cuda_programmable_bootstrap_tbc(
+uint64_t scratch_cuda_programmable_bootstrap_tbc(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+    bool allocate_gpu_memory, bool allocate_ms_array);
 #endif

 // Declaration of the scratch routine for the CG variant of the classical
 // PBS; it receives the output buffer through `pbs_buffer`.
 // NOTE(review): the uint64_t return presumably reports the size accumulated
 // by the allocation tracker - confirm against the definition.
 template <typename Torus>
-void scratch_cuda_programmable_bootstrap_cg(
+uint64_t scratch_cuda_programmable_bootstrap_cg(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+    bool allocate_gpu_memory, bool allocate_ms_array);

 // Declaration of the scratch routine for the classical PBS without a
 // variant suffix; it receives the output buffer through `buffer`.
 // NOTE(review): the uint64_t return presumably reports the size accumulated
 // by the allocation tracker - confirm against the definition.
 template <typename Torus>
-void scratch_cuda_programmable_bootstrap(
+uint64_t scratch_cuda_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+    bool allocate_gpu_memory, bool allocate_ms_array);

 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
--- a/Show More
+++ b/Show More