Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 07:38:08 -05:00

Compare commits: al/backup ... ns/fix_ben (114 commits)
| SHA1 |
|---|
| cd0785ce7a |
| 94d24e1f8b |
| 9a1c0f48f4 |
| ff29535eb0 |
| a8f391a442 |
| 34743ea304 |
| f62e5b3e3b |
| 6a7244105a |
| 50cfb8021a |
| c06b513182 |
| 677da3855e |
| c52e2e32d0 |
| fa48444611 |
| 71f427de9e |
| 451458df97 |
| 44ac59099b |
| bafce4657a |
| 3b5545b7a6 |
| 167e96a30c |
| 14063ca3b3 |
| f8cf613640 |
| a3f8dc6c2a |
| f776c737a1 |
| c1c7fe78ed |
| cc6b074f6d |
| 4b6942a0f8 |
| 53da030831 |
| cedcbb99e7 |
| 58e02e56d1 |
| 827cea966b |
| d389ea67a1 |
| 0a28488079 |
| 8083990c30 |
| b67964f4a0 |
| 1647ec8f21 |
| a77c66244c |
| ce9647d3a9 |
| 7b7ad5bea0 |
| afd628c7b9 |
| 4909a8ef0e |
| c8a9105953 |
| b3f1a85e1d |
| 5fa8cc8563 |
| a7dd071bd4 |
| eb6760a7c8 |
| 7f0838270c |
| 1169096058 |
| 9316922e81 |
| 8ff73f7d73 |
| 0c3bda3444 |
| 55eade03e6 |
| 52b1946f25 |
| bc5c2f51ff |
| e5e54be4a4 |
| 0aaadf04d9 |
| 4d1b917045 |
| a85b30a7b2 |
| 81fa0e43ee |
| 15bc0c6792 |
| 8b5de6d57d |
| 54c6b9e50a |
| e31333b2c7 |
| 37ed32cf4f |
| 4a3be71bd7 |
| a63207af9e |
| 4c4c7a47a5 |
| dbc3924989 |
| 04d4ccc16c |
| 9d4a9fe71e |
| 3b42f9873a |
| afd8f58a8d |
| 1b92bcf476 |
| 79d5db66d4 |
| d741e55218 |
| ef5a391dc2 |
| d1c417bf71 |
| 46a7229c81 |
| 852a06b330 |
| ea200c3548 |
| 1a1b88362c |
| fe2dde0e0c |
| e7e095b924 |
| 7bf2ec6ff2 |
| 2d7e1b2293 |
| 79aeeca3b2 |
| e89d2f8b05 |
| 0b3ea4be9e |
| 68a7520e73 |
| a411e5720d |
| 5ee5569d0d |
| 54d038ef30 |
| b6e6abb066 |
| 82a5cc7f2d |
| 908922171d |
| 84f6a8082d |
| 0bc59dca59 |
| 09ffc39b15 |
| 099345df02 |
| 48c10e91f7 |
| 36eceaf05e |
| e8986cbd7c |
| fc8063a59b |
| 65b034ef70 |
| 2aa83c99ea |
| d78266e141 |
| 62e6504ef0 |
| 209a8f1ad9 |
| 3621dd1ae7 |
| b5a7199c15 |
| 09aaa4e045 |
| 63203c58aa |
| 03f8a134b3 |
| 981da1d3fc |
| 0386090048 |
35 .github/actions/gpu_setup/action.yml (vendored)

@@ -23,38 +23,58 @@ runs:
echo "${CMAKE_SCRIPT_SHA}  cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
sha256sum -c checksum
sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
sudo apt remove -y unattended-upgrades
sudo apt update
sudo apt install -y cmake-format libclang-dev
env:
CMAKE_VERSION: 3.29.6
CMAKE_SCRIPT_SHA: "6e4fada5cba3472ae503a11232b6580786802f0879cead2741672bf65d97488a"

- name: Install GCC
if: inputs.github-instance == 'true'
shell: bash
env:
GCC_VERSION: ${{ inputs.gcc-version }}
run: |
sudo apt-get install gcc-"{GCC_VERSION}" g++-"{GCC_VERSION}"
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"{GCC_VERSION}" 20
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"{GCC_VERSION}" 20

- name: Check GCC
shell: bash
env:
GCC_VERSION: ${{ inputs.gcc-version }}
run: |
which gcc-"${GCC_VERSION}"

- name: Install CUDA
if: inputs.github-instance == 'true'
shell: bash
env:
CUDA_VERSION: ${{ inputs.cuda-version }}
CUDA_KEYRING_PACKAGE: cuda-keyring_1.1-1_all.deb
CUDA_KEYRING_SHA: "d93190d50b98ad4699ff40f4f7af50f16a76dac3bb8da1eaaf366d47898ff8df"
run: |
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${env.CUDA_KEYRING_PACKAGE}
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${CUDA_KEYRING_PACKAGE}
echo "${CUDA_KEYRING_SHA}  ${CUDA_KEYRING_PACKAGE}" > checksum
sha256sum -c checksum
sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
sudo apt update
sudo apt -y install cuda-toolkit-"${TOOLKIT_VERSION}"
env:
CUDA_VERSION: ${{ inputs.cuda-version }}
CUDA_KEYRING_PACKAGE: cuda-keyring_1.1-1_all.deb
CUDA_KEYRING_SHA: "d93190d50b98ad4699ff40f4f7af50f16a76dac3bb8da1eaaf366d47898ff8df"

- name: Export CUDA variables
shell: bash
run: |
find /usr/local -executable -name "nvcc"
CUDA_PATH=/usr/local/cuda-"${CUDA_VERSION}"
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH";
echo "CUDA_MODULE_LOADER=EAGER";
echo "PATH=$PATH:$CUDA_PATH/bin";
} >> "${GITHUB_ENV}"
{
echo "PATH=$PATH:$CUDA_PATH/bin";
@@ -74,6 +94,11 @@ runs:
env:
GCC_VERSION: ${{ inputs.gcc-version }}

- name: Check setup
shell: bash
run: |
which nvcc

- name: Check device is detected
shell: bash
run: nvidia-smi
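A minimal sketch (not part of the diff) of what the `TOOLKIT_VERSION` sed line in the "Install CUDA" step computes, assuming `CUDA_VERSION` carries a value such as `12.8`:

```bash
#!/usr/bin/env bash
# Hypothetical input value; in the action it comes from the cuda-version input.
CUDA_VERSION="12.8"
# Same substitution as in the step above: capture the parts around the dot and join them with a dash.
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
echo "${TOOLKIT_VERSION}"   # prints 12-8, i.e. the suffix of the cuda-toolkit-12-8 apt package
```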
@@ -61,13 +61,13 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

- name: Install latest stable
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

@@ -80,7 +80,7 @@ jobs:

- name: Retrieve data from cache
id: retrieve-data-cache
uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor
@@ -99,7 +99,7 @@ jobs:
- name: Store data in cache
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
continue-on-error: true
uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor
10 .github/workflows/aws_tfhe_fast_tests.yml (vendored)

@@ -60,7 +60,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -168,13 +168,13 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

- name: Install latest stable
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

@@ -216,7 +216,7 @@ jobs:

- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
with:
path: |
~/.nvm
@@ -229,7 +229,7 @@ jobs:
make install_node

- name: Node cache save
uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
6 .github/workflows/aws_tfhe_integer_tests.yml (vendored)

@@ -47,7 +47,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -108,13 +108,13 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

- name: Install latest stable
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
115 .github/workflows/aws_tfhe_noise_checks.yml (vendored, new file)

@@ -0,0 +1,115 @@
name: Run noise checks on CPU

env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACKIFY_MARKDOWN: true
PULL_REQUEST_MD_LINK: ""
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
# Secrets will be available only to zama-ai organization members
SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}

on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:

permissions:
contents: read

jobs:
setup-instance:
name: Setup instance (noise-checks)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
# We want an hpc7a more compute, will be faster
profile: bench

# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
id: start-github-instance
if: env.SECRETS_AVAILABLE == 'false'
run: |
echo "Cannot run this without secrets"
exit 1

noise-checks:
name: CPU noise checks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

- name: Install latest stable
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

- name: Run noise checks
timeout-minutes: 1440
run: |
make test_noise_check

- name: Set pull-request URL
if: ${{ !success() }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}

- name: Slack Notification
if: ${{ !success() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Noise checks tests finished with status: ${{ job.status }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"

teardown-instance:
name: Teardown instance (noise-checks)
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, noise-checks ]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}

- name: Slack Notification
if: ${{ !success() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (noise-checks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
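A minimal sketch (not part of the workflow) of the `${GITHUB_ENV}` pattern used by the "Set pull-request URL" step above and by "Export CUDA variables" in gpu_setup: lines appended to the file behind `$GITHUB_ENV` become environment variables for later steps of the same job.

```bash
#!/usr/bin/env bash
# Placeholder values; the workflow fills them from vars and the pull_request event context.
PR_BASE_URL="https://example.com/pull/"
PR_NUMBER="1234"
# In a real runner GITHUB_ENV points at a file provided by GitHub Actions.
# A grouped block lets several KEY=value lines share one redirect.
{
  echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), ";
  echo "CUDA_MODULE_LOADER=EAGER";
} >> "${GITHUB_ENV}"
```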
@@ -48,7 +48,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -109,13 +109,13 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

- name: Install latest stable
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
6 .github/workflows/aws_tfhe_tests.yml (vendored)
@@ -69,7 +69,7 @@ jobs:
|
||||
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -179,13 +179,13 @@ jobs:
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
8 .github/workflows/aws_tfhe_wasm_tests.yml (vendored)
@@ -62,13 +62,13 @@ jobs:
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -78,7 +78,7 @@ jobs:
|
||||
|
||||
- name: Node cache restoration
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
|
||||
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -91,7 +91,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
|
||||
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
|
||||
if: steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
|
||||
6 .github/workflows/benchmark_boolean.yml (vendored)
@@ -50,7 +50,7 @@ jobs:
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -68,7 +68,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -107,7 +107,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
6 .github/workflows/benchmark_core_crypto.yml (vendored)
@@ -50,7 +50,7 @@ jobs:
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -68,7 +68,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -100,7 +100,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
6 .github/workflows/benchmark_dex.yml (vendored)
@@ -50,7 +50,7 @@ jobs:
|
||||
timeout-minutes: 720 # 12 hours
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -68,12 +68,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
6 .github/workflows/benchmark_erc20.yml (vendored)
@@ -51,7 +51,7 @@ jobs:
|
||||
timeout-minutes: 720 # 12 hours
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -69,12 +69,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
12 .github/workflows/benchmark_gpu_4090.yml (vendored)
@@ -38,7 +38,7 @@ jobs:
|
||||
timeout-minutes: 1440 # 24 hours
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -57,12 +57,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -122,7 +122,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -140,12 +140,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
12 .github/workflows/benchmark_gpu_common.yml (vendored)
@@ -192,11 +192,11 @@ jobs:
|
||||
matrix:
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- cuda: "12.2"
|
||||
- cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -224,13 +224,13 @@ jobs:
|
||||
params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
|
||||
# explicit include-based build matrix, of known valid options
|
||||
include:
|
||||
- cuda: "12.2"
|
||||
- cuda: "12.8"
|
||||
gcc: 11
|
||||
env:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -269,7 +269,7 @@ jobs:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -312,7 +312,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -100,11 +100,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -129,7 +129,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -160,7 +160,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -101,11 +101,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -161,7 +161,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
6 .github/workflows/benchmark_hpu_integer.yml (vendored)
@@ -29,7 +29,7 @@ jobs:
|
||||
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
|
||||
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -48,12 +48,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
6 .github/workflows/benchmark_integer.yml (vendored)
@@ -120,7 +120,7 @@ jobs:
|
||||
bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -138,12 +138,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
6 .github/workflows/benchmark_shortint.yml (vendored)
@@ -81,7 +81,7 @@ jobs:
|
||||
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -99,12 +99,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
@@ -120,7 +120,7 @@ jobs:
|
||||
bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -138,12 +138,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
4 .github/workflows/benchmark_tfhe_fft.yml (vendored)
@@ -53,7 +53,7 @@ jobs:
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -100,7 +100,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
4 .github/workflows/benchmark_tfhe_ntt.yml (vendored)
@@ -53,7 +53,7 @@ jobs:
|
||||
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -100,7 +100,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
10 .github/workflows/benchmark_tfhe_zk_pok.yml (vendored)
@@ -42,7 +42,7 @@ jobs:
|
||||
zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -108,12 +108,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -148,7 +148,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
12 .github/workflows/benchmark_wasm_client.yml (vendored)
@@ -36,7 +36,7 @@ jobs:
|
||||
wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -88,7 +88,7 @@ jobs:
|
||||
browser: [ chrome, firefox ]
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -106,7 +106,7 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
@@ -116,7 +116,7 @@ jobs:
|
||||
|
||||
- name: Node cache restoration
|
||||
id: node-cache
|
||||
uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
|
||||
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
|
||||
with:
|
||||
path: |
|
||||
~/.nvm
|
||||
@@ -129,7 +129,7 @@ jobs:
|
||||
make install_node
|
||||
|
||||
- name: Node cache save
|
||||
uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 #v4.2.3
|
||||
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
|
||||
if: steps.node-cache.outputs.cache-hit != 'true'
|
||||
with:
|
||||
path: |
|
||||
@@ -185,7 +185,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
10 .github/workflows/benchmark_zk_pke.yml (vendored)
@@ -43,7 +43,7 @@ jobs:
|
||||
zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -132,7 +132,7 @@ jobs:
|
||||
bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -150,12 +150,12 @@ jobs:
|
||||
SHA: ${{ github.sha }}
|
||||
|
||||
- name: Install rust
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
@@ -198,7 +198,7 @@ jobs:
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
17 .github/workflows/cargo_build.yml (vendored)
@@ -29,13 +29,13 @@ jobs:
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -49,14 +49,6 @@ jobs:
|
||||
mv linelint-linux-amd64 /usr/local/bin/linelint
|
||||
make check_newline
|
||||
|
||||
# This is needed for the ws tests clippy checks
|
||||
- name: Use specific data branch
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
|
||||
env:
|
||||
PR_BRANCH: ${{ github.head_ref || github.ref_name }}
|
||||
run: |
|
||||
echo "BACKWARD_COMPAT_DATA_BRANCH=${PR_BRANCH}" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Run pcc checks
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
@@ -67,6 +59,11 @@ jobs:
|
||||
run: |
|
||||
make build_tfhe_csprng
|
||||
|
||||
- name: Build with MSRV
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_tfhe_msrv
|
||||
|
||||
- name: Build Release core
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
|
||||
2 .github/workflows/cargo_build_tfhe_fft.yml (vendored)
@@ -25,7 +25,7 @@ jobs:
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
2 .github/workflows/cargo_build_tfhe_ntt.yml (vendored)
@@ -23,7 +23,7 @@ jobs:
|
||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
8 .github/workflows/cargo_test_fft.yml (vendored)
@@ -28,7 +28,7 @@ jobs:
|
||||
fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
matrix:
|
||||
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -108,7 +108,7 @@ jobs:
|
||||
if: needs.should-run.outputs.fft_test == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
6 .github/workflows/cargo_test_ntt.yml (vendored)
@@ -28,7 +28,7 @@ jobs:
|
||||
ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
os: [ ubuntu-latest, macos-latest, windows-latest ]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -79,7 +79,7 @@ jobs:
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest, windows-latest ]
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
2 .github/workflows/ci_lint.yml (vendored)
@@ -18,7 +18,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
8 .github/workflows/code_coverage.yml (vendored)
@@ -48,13 +48,13 @@ jobs:
|
||||
timeout-minutes: 5760 # 4 days
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -90,7 +90,7 @@ jobs:
|
||||
make test_shortint_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
|
||||
uses: codecov/codecov-action@fdcc8476540edceab3de004e990f80d881c6cc00
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
@@ -104,7 +104,7 @@ jobs:
|
||||
make test_integer_cov
|
||||
|
||||
- name: Upload tfhe coverage to Codecov
|
||||
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
|
||||
uses: codecov/codecov-action@fdcc8476540edceab3de004e990f80d881c6cc00
|
||||
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
|
||||
291 .github/workflows/coprocessor-benchmark-gpu.yml (vendored, new file)
@@ -0,0 +1,291 @@
|
||||
# Run all fhevm coprocessor benchmarks on a GPU instance on Hyperstack and return parsed results to Slab CI bot.
|
||||
name: Cuda Coprocessor benchmarks
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Weekly tests @ 1AM
|
||||
- cron: "0 1 * * 6"
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
PROFILE: "multi-h100-sxm5 (n3-H100x8-SXM5)"
|
||||
BENCHMARK_TYPE: "ALL"
|
||||
OPTIMIZATION_TARGET: "throughput"
|
||||
BATCH_SIZE: "5000"
|
||||
SCHEDULING_POLICY: "MAX_PARALLELISM"
|
||||
BENCHMARKS: "erc20"
|
||||
BRANCH_NAME: ${{ github.ref_name }}
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
SLAB_SECRET: ${{ secrets.JOB_SECRET }}
|
||||
|
||||
jobs:
|
||||
parse-inputs:
|
||||
name: coprocessor-benchmark-gpu/parse-inputs
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: 'read'
|
||||
outputs:
|
||||
profile: ${{ steps.parse_profile.outputs.profile }}
|
||||
hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
|
||||
steps:
|
||||
- name: Parse profile
|
||||
id: parse_profile
|
||||
run: |
|
||||
# shellcheck disable=SC2001
|
||||
PROFILE_VAL=$(echo "${PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
|
||||
echo "profile=$PROFILE_VAL" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Parse hardware name
|
||||
id: parse_hardware_name
|
||||
run: |
|
||||
# shellcheck disable=SC2001
|
||||
PROFILE_VAL=$(echo "${PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
|
||||
echo "name=$PROFILE_VAL" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
setup-instance:
|
||||
name: coprocessor-benchmark-gpu/setup-instance
|
||||
needs: parse-inputs
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: 'read'
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: ${{ needs.parse-inputs.outputs.profile }}
|
||||
|
||||
benchmark:
|
||||
name: coprocessor-benchmark-gpu/benchmark-gpu (bpr)
|
||||
needs: [ parse-inputs, setup-instance ]
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
continue-on-error: true
|
||||
timeout-minutes: 720 # 12 hours
|
||||
permissions:
|
||||
contents: 'read'
|
||||
packages: 'read'
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
env:
|
||||
HW_NAME: "${{ needs.parse-inputs.outputs.hardware_name }}"
|
||||
|
||||
steps:
|
||||
- name: Install git LFS
|
||||
run: |
|
||||
sudo apt-get remove -y unattended-upgrades
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y git-lfs protobuf-compiler
|
||||
git lfs install
|
||||
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
|
||||
with:
|
||||
path: tfhe-rs
|
||||
persist-credentials: false
|
||||
|
||||
- name: Check fhEVM and TFHE-rs repos
|
||||
run: |
|
||||
pwd
|
||||
ls
|
||||
|
||||
- name: Checkout fhevm
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
|
||||
with:
|
||||
repository: zama-ai/fhevm
|
||||
persist-credentials: 'false'
|
||||
fetch-depth: 0
|
||||
lfs: true
|
||||
ref: antoniu/use-tfhe-main-benches
|
||||
path: fhevm
|
||||
|
||||
- name: Get benchmark details
|
||||
run: |
|
||||
COMMIT_DATE_ENV=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${COMMIT_SHA}")
|
||||
{
|
||||
echo "BENCH_DATE=$(date --iso-8601=seconds)";
|
||||
echo "COMMIT_DATE=$COMMIT_DATE_ENV";
|
||||
echo "COMMIT_HASH=$(git rev-parse HEAD)";
|
||||
} >> "${GITHUB_ENV}"
|
||||
working-directory: tfhe-rs/
|
||||
- name: Check fhEVM and TFHE-rs repos
  run: |
    pwd
    ls
    mv tfhe-rs fhevm/coprocessor/

- name: Checkout LFS objects
  run: git lfs checkout
  working-directory: fhevm/

- name: Setup Hyperstack dependencies
  uses: ./fhevm/.github/actions/gpu_setup
  with:
    cuda-version: ${{ matrix.cuda }}
    gcc-version: ${{ matrix.gcc }}
    github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

- name: Install rust
  uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
  with:
    toolchain: nightly

- name: Install cargo dependencies
  run: |
    sudo apt-get install -y protobuf-compiler cmake pkg-config libssl-dev \
      libclang-dev docker-compose-v2 docker.io acl
    sudo usermod -aG docker "$USER"
    newgrp docker
    sudo setfacl --modify user:"$USER":rw /var/run/docker.sock
    cargo install sqlx-cli

- name: Install foundry
  uses: foundry-rs/foundry-toolchain@de808b1eea699e761c404bda44ba8f21aba30b2c

- name: Cache cargo
  uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
  with:
    path: |
      ~/.cargo/registry
      ~/.cargo/git
      target
    key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
    restore-keys: ${{ runner.os }}-cargo-

- name: Login to GitHub Container Registry
  uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
  with:
    registry: ghcr.io
    username: ${{ github.actor }}
    password: ${{ secrets.GITHUB_TOKEN }}

- name: Init database
  run: make init_db
  working-directory: fhevm/coprocessor/fhevm-engine/coprocessor

- name: Use Node.js
  uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 # v4.0.2
  with:
    node-version: 20.x

- name: Build contracts
  env:
    HARDHAT_NETWORK: hardhat
  run: |
    ls
    pwd
    cp ./host-contracts/.env.example ./host-contracts/.env
    npm --prefix ./host-contracts ci --include=optional
    cd host-contracts && npm install && npm run deploy:emptyProxies && npx hardhat compile
  working-directory: fhevm/

- name: Profile erc20 no-cmux benchmark on GPU
  run: |
    BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "profile_erc20_gpu"
  working-directory: fhevm/coprocessor/fhevm-engine/coprocessor

- name: Get nsys profile name
  id: nsys_profile_name
  run: echo "profile=coprocessor_profile_$(date +"%Y-%m-%d-%Hh").nsys-rep" >> "$GITHUB_OUTPUT"

- name: Timestamp nsys profile # zizmor: ignore[template-injection]
  env:
    REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
  run: |
    mv report1.nsys-rep ${{ env.REPORT_NAME }}
  working-directory: fhevm/coprocessor/fhevm-engine/coprocessor

- name: Upload profile artifact
  env:
    REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
  with:
    name: ${{ env.REPORT_NAME }}
    path: fhevm/coprocessor/fhevm-engine/coprocessor/${{ env.REPORT_NAME }}

- name: Run latency benchmark on GPU
  run: |
    BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="LATENCY" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu"
  working-directory: fhevm/coprocessor/fhevm-engine/coprocessor

- name: Run throughput benchmarks on GPU
  run: |
    BENCHMARK_BATCH_SIZE="${BATCH_SIZE}" FHEVM_DF_SCHEDULE="${SCHEDULING_POLICY}" BENCHMARK_TYPE="THROUGHPUT_200" OPTIMIZATION_TARGET="${OPTIMIZATION_TARGET}" make -e "benchmark_${BENCHMARKS}_gpu"
  working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
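Note: the benchmark invocations above pass BENCHMARK_BATCH_SIZE, FHEVM_DF_SCHEDULE, BENCHMARK_TYPE and OPTIMIZATION_TARGET as environment variables and rely on make -e (--environment-overrides) so those values take precedence over assignments made inside the coprocessor Makefile itself. A small self-contained sketch of that behaviour, against a hypothetical throwaway makefile (not the real coprocessor one):

    printf 'BENCHMARK_TYPE = LATENCY\nshow:\n\t@echo "BENCHMARK_TYPE=$(BENCHMARK_TYPE)"\n' > /tmp/bench.mk
    make -f /tmp/bench.mk show                                    # -> LATENCY (makefile assignment wins)
    BENCHMARK_TYPE=THROUGHPUT_200 make -e -f /tmp/bench.mk show   # -> THROUGHPUT_200 (environment wins with -e)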
- name: Parse results
  run: |
    python3 ./ci/benchmark_parser.py coprocessor/fhevm-engine/target/criterion "${RESULTS_FILENAME}" \
      --database coprocessor \
      --hardware "${HW_NAME}" \
      --backend gpu \
      --project-version "${COMMIT_HASH}" \
      --branch "${BRANCH_NAME}" \
      --commit-date "${COMMIT_DATE}" \
      --bench-date "${BENCH_DATE}" \
      --walk-subdirs \
      --crate "coprocessor/fhevm-engine/coprocessor" \
      --name-suffix "operation_batch_size_${BATCH_SIZE}-schedule_${SCHEDULING_POLICY}-optimization_target_${OPTIMIZATION_TARGET}"
  working-directory: fhevm/

- name: Upload parsed results artifact
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
  with:
    name: ${COMMIT_SHA}_${BENCHMARKS}_${{ needs.parse-inputs.outputs.profile }}
    path: fhevm/${{ env.RESULTS_FILENAME }}

- name: Checkout Slab repo
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
  with:
    repository: zama-ai/slab
    path: slab
    persist-credentials: 'false'
    token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

- name: Send data to Slab
  shell: bash
  env:
    SLAB_URL: ${{ secrets.SLAB_URL }}
  run: |
    python3 slab/scripts/data_sender.py fhevm/"${RESULTS_FILENAME}" "${SLAB_SECRET}" \
      --slab-url "${SLAB_URL}"

teardown-instance:
  name: coprocessor-benchmark-gpu/teardown
  if: ${{ always() && needs.setup-instance.result == 'success' }}
  needs: [ setup-instance, benchmark ]
  runs-on: ubuntu-latest
  permissions:
    contents: 'read'
  steps:
    - name: Stop remote instance
      id: stop-instance
      uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
      with:
        mode: stop
        github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
        slab-url: ${{ secrets.SLAB_BASE_URL }}
        job-secret: ${{ secrets.JOB_SECRET }}
        label: ${{ needs.setup-instance.outputs.runner-name }}

@@ -60,13 +60,13 @@ jobs:
  runs-on: ${{ needs.setup-instance.outputs.runner-name }}
  steps:
    - name: Checkout tfhe-rs
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
      with:
        persist-credentials: 'false'
        token: ${{ env.CHECKOUT_TOKEN }}

    - name: Install latest stable
      uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
      uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
      with:
        toolchain: stable

4  .github/workflows/gpu_4090_tests.yml (vendored)

@@ -39,13 +39,13 @@ jobs:
  steps:
    - name: Checkout tfhe-rs
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
      with:
        persist-credentials: 'false'
        token: ${{ env.CHECKOUT_TOKEN }}

    - name: Install latest stable
      uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
      uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
      with:
        toolchain: stable

152  .github/workflows/gpu_code_validation_tests.yml (vendored, new file)

@@ -0,0 +1,152 @@
# Compile and test tfhe-cuda-backend on an AWS instance
name: Cuda - CPU Memory Checks

env:
  CARGO_TERM_COLOR: always
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUSTFLAGS: "-C target-cpu=native"
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  SLACKIFY_MARKDOWN: true
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
  PULL_REQUEST_MD_LINK: ""
  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
  # Secrets will be available only to zama-ai organization members
  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
    # every 3 months
    - cron: "0 0 1 */3 *"

permissions:
  contents: read

jobs:
  setup-instance:
    name: Setup instance (cuda-tests)
    runs-on: ubuntu-latest
    if: github.event_name != 'pull_request' ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved')
    outputs:
      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: hyperstack
          profile: gpu-test

      # This instance will be spawned especially for pull-request from forked repository
      - name: Start GitHub instance
        id: start-github-instance
        if: env.SECRETS_AVAILABLE == 'false'
        run: |
          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"

  cuda-tests-linux:
    name: CUDA Memory Checks tests
    needs: [ setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    timeout-minutes: 5760
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
      matrix:
        include:
          - os: ubuntu-22.04
            cuda: "12.8"
            gcc: 11
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
        with:
          persist-credentials: 'false'
          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}

      - name: Find tools
        run: |
          sudo apt update && sudo apt install -y valgrind
          find /usr -executable -name "compute-sanitizer"
          which valgrind

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
        with:
          toolchain: stable

      - name: Run memory sanitizer
        run: |
          make test_high_level_api_gpu_valgrind

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Set pull-request URL
        if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
        run: |
          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
        env:
          PR_BASE_URL: ${{ vars.PR_BASE_URL }}
          PR_NUMBER: ${{ github.event.pull_request.number }}

      - name: Send message
        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
          SLACK_MESSAGE: "GPU Memory Checks tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"

  teardown-instance:
    name: Teardown instance (cuda-tests)
    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop remote instance
        id: stop-instance
        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
12
.github/workflows/gpu_fast_h100_tests.yml
vendored
12
.github/workflows/gpu_fast_h100_tests.yml
vendored
@@ -37,7 +37,7 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -122,11 +122,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -140,10 +140,12 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Run core crypto and internal CUDA backend tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
|
||||
|
||||
12
.github/workflows/gpu_fast_tests.yml
vendored
12
.github/workflows/gpu_fast_tests.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -107,11 +107,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -124,10 +124,14 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
|
||||
- name: Run core crypto and internal CUDA backend tests
|
||||
run: |
|
||||
make test_core_crypto_gpu
|
||||
|
||||
10
.github/workflows/gpu_full_h100_tests.yml
vendored
10
.github/workflows/gpu_full_h100_tests.yml
vendored
@@ -62,11 +62,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
@@ -79,10 +79,12 @@ jobs:
|
||||
gcc-version: ${{ matrix.gcc }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Run core crypto, integer and internal CUDA backend tests
|
||||
run: |
|
||||
make test_gpu
|
||||
|
||||
12
.github/workflows/gpu_full_multi_gpu_tests.yml
vendored
12
.github/workflows/gpu_full_multi_gpu_tests.yml
vendored
@@ -37,7 +37,7 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -109,11 +109,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -126,10 +126,12 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Run multi-bit CUDA integer compression tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
|
||||
|
||||
10
.github/workflows/gpu_integer_long_run_tests.yml
vendored
10
.github/workflows/gpu_integer_long_run_tests.yml
vendored
@@ -58,12 +58,12 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
timeout-minutes: 4320 # 72 hours
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -75,10 +75,12 @@ jobs:
|
||||
gcc-version: ${{ matrix.gcc }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Run tests
|
||||
run: |
|
||||
if [[ "${IS_PR}" == "true" ]]; then
|
||||
|
||||
149
.github/workflows/gpu_memory_sanitizer.yml
vendored
Normal file
149
.github/workflows/gpu_memory_sanitizer.yml
vendored
Normal file
@@ -0,0 +1,149 @@
|
||||
# Compile and test tfhe-cuda-backend on an AWS instance
|
||||
name: Cuda - GPU Memory Checks
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
RUSTFLAGS: "-C target-cpu=native"
|
||||
RUST_BACKTRACE: "full"
|
||||
RUST_MIN_STACK: "8388608"
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
SLACKIFY_MARKDOWN: true
|
||||
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
|
||||
PULL_REQUEST_MD_LINK: ""
|
||||
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
|
||||
# Secrets will be available only to zama-ai organization members
|
||||
SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
|
||||
EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
|
||||
|
||||
on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
setup-instance:
|
||||
name: Setup instance (cuda-tests)
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event.action == 'labeled' && github.event.label.name == 'approved')
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
|
||||
steps:
|
||||
- name: Start remote instance
|
||||
id: start-remote-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
|
||||
with:
|
||||
mode: start
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
backend: hyperstack
|
||||
profile: gpu-test
|
||||
|
||||
# This instance will be spawned especially for pull-request from forked repository
|
||||
- name: Start GitHub instance
|
||||
id: start-github-instance
|
||||
if: env.SECRETS_AVAILABLE == 'false'
|
||||
run: |
|
||||
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
cuda-tests-linux:
|
||||
name: CUDA Memory Checks tests
|
||||
needs: [ setup-instance ]
|
||||
if: github.event_name != 'pull_request' ||
|
||||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
|
||||
concurrency:
|
||||
group: ${{ github.workflow_ref }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
timeout-minutes: 240
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Setup Hyperstack dependencies
|
||||
uses: ./.github/actions/gpu_setup
|
||||
with:
|
||||
cuda-version: ${{ matrix.cuda }}
|
||||
gcc-version: ${{ matrix.gcc }}
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Find tools
|
||||
run: |
|
||||
find /usr -executable -name "compute-sanitizer"
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Run memory sanitizer
|
||||
run: |
|
||||
make test_high_level_api_gpu_sanitizer
|
||||
|
||||
slack-notify:
|
||||
name: Slack Notification
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
|
||||
continue-on-error: true
|
||||
steps:
|
||||
- name: Set pull-request URL
|
||||
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
|
||||
run: |
|
||||
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
|
||||
env:
|
||||
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
|
||||
- name: Send message
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
|
||||
env:
|
||||
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
|
||||
SLACK_MESSAGE: "GPU Memory Checks tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
|
||||
|
||||
teardown-instance:
|
||||
name: Teardown instance (cuda-tests)
|
||||
if: ${{ always() && needs.setup-instance.result == 'success' }}
|
||||
needs: [ setup-instance, cuda-tests-linux ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop remote instance
|
||||
id: stop-instance
|
||||
if: env.SECRETS_AVAILABLE == 'true'
|
||||
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
|
||||
with:
|
||||
mode: stop
|
||||
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
|
||||
slab-url: ${{ secrets.SLAB_BASE_URL }}
|
||||
job-secret: ${{ secrets.JOB_SECRET }}
|
||||
label: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
4
.github/workflows/gpu_pcc.yml
vendored
4
.github/workflows/gpu_pcc.yml
vendored
@@ -72,7 +72,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -94,7 +94,7 @@ jobs:
|
||||
CUDA_VERSION: ${{ matrix.cuda }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -109,11 +109,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -126,10 +126,12 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Run signed integer tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
|
||||
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -122,11 +122,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -140,10 +140,12 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Run signed integer multi-bit tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
|
||||
|
||||
12
.github/workflows/gpu_signed_integer_tests.yml
vendored
12
.github/workflows/gpu_signed_integer_tests.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -110,11 +110,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -127,10 +127,12 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Should run nightly tests
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -109,11 +109,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -126,10 +126,12 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Run unsigned integer tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
|
||||
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -122,11 +122,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -140,10 +140,12 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Run unsigned integer multi-bit tests
|
||||
run: |
|
||||
BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
|
||||
|
||||
12
.github/workflows/gpu_unsigned_integer_tests.yml
vendored
12
.github/workflows/gpu_unsigned_integer_tests.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -110,11 +110,11 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-22.04
|
||||
cuda: "12.2"
|
||||
cuda: "12.8"
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
@@ -127,10 +127,12 @@ jobs:
|
||||
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Enable nvidia multi-process service
|
||||
run: |
|
||||
nvidia-cuda-mps-control -d
|
||||
- name: Should run nightly tests
|
||||
if: github.event_name == 'schedule'
|
||||
run: |
|
||||
|
||||
4
.github/workflows/hpu_hlapi_tests.yml
vendored
4
.github/workflows/hpu_hlapi_tests.yml
vendored
@@ -28,7 +28,7 @@ jobs:
|
||||
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -50,7 +50,7 @@ jobs:
|
||||
if: needs.should-run.outputs.hpu_test == 'true'
|
||||
runs-on: large_ubuntu_16
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
4
.github/workflows/integer_long_run_tests.yml
vendored
4
.github/workflows/integer_long_run_tests.yml
vendored
@@ -51,13 +51,13 @@ jobs:
|
||||
timeout-minutes: 4320 # 72 hours
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
4
.github/workflows/m1_tests.yml
vendored
4
.github/workflows/m1_tests.yml
vendored
@@ -40,13 +40,13 @@ jobs:
|
||||
timeout-minutes: 720
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
|
||||
15  .github/workflows/make_release.yml (vendored)

@@ -49,7 +49,7 @@ jobs:
  hash: ${{ steps.hash.outputs.hash }}
  steps:
    - name: Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
      with:
        fetch-depth: 0
        persist-credentials: 'false'
@@ -87,10 +87,10 @@ jobs:
  # For provenance of npmjs publish
  permissions:
    contents: read
    id-token: write
    id-token: write # also needed for OIDC token exchange on crates.io
  steps:
    - name: Checkout
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
      with:
        fetch-depth: 0
        persist-credentials: 'false'
@@ -100,20 +100,23 @@ jobs:
      run: |
        echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
    - name: Download artifact
      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
      uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
      with:
        name: crate
        path: target/package
    - name: Authenticate on registry
      uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
      id: auth
    - name: Publish crate.io package
      if: ${{ inputs.push_to_crates }}
      env:
        CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
        CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
        DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
      run: |
        # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
        # would fail. This is safe since DRY_RUN is handled in the env section above.
        # shellcheck disable=SC2086
        cargo publish -p tfhe --token "${CRATES_TOKEN}" ${DRY_RUN}
        cargo publish -p tfhe ${DRY_RUN}

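Note: the shellcheck suppression above exists because, when dry_run is false, DRY_RUN expands to an empty string; left unquoted the empty word simply disappears from the command line, whereas a quoted "${DRY_RUN}" would hand cargo an empty argument and make the publish fail. A quick illustration of the difference (argument counting only, nothing is published):

    DRY_RUN=""
    set -- cargo publish -p tfhe "${DRY_RUN}"   # quoted: 5 words, the last one empty
    echo "$#"                                   # -> 5
    set -- cargo publish -p tfhe ${DRY_RUN}     # unquoted: the empty word vanishes
    echo "$#"                                   # -> 4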
    - name: Generate hash
      id: published_hash

17
.github/workflows/make_release_cuda.yml
vendored
17
.github/workflows/make_release_cuda.yml
vendored
@@ -60,14 +60,14 @@ jobs:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: "false"
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -122,6 +122,9 @@ jobs:
|
||||
name: Publish CUDA Release
|
||||
needs: [setup-instance, package] # for comparing hashes
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
strategy:
|
||||
fail-fast: false
|
||||
# explicit include-based build matrix, of known valid options
|
||||
@@ -134,7 +137,7 @@ jobs:
|
||||
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
|
||||
steps:
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
@@ -163,15 +166,19 @@ jobs:
|
||||
env:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-cuda-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
|
||||
cargo publish -p tfhe-cuda-backend ${DRY_RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
|
||||
15
.github/workflows/make_release_hpu.yml
vendored
15
.github/workflows/make_release_hpu.yml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -66,23 +66,30 @@ jobs:
|
||||
name: Publish tfhe-hpu-backend Release
|
||||
runs-on: ubuntu-latest
|
||||
needs: [verify_tag, package] # for comparing hashes
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
|
||||
cargo publish -p tfhe-hpu-backend ${DRY_RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
|
||||
16
.github/workflows/make_release_tfhe_csprng.yml
vendored
16
.github/workflows/make_release_tfhe_csprng.yml
vendored
@@ -30,7 +30,7 @@ jobs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -67,27 +67,33 @@ jobs:
|
||||
name: Publish tfhe-csprng Release
|
||||
needs: [verify_tag, package]
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate-tfhe-csprng
|
||||
path: target/package
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-csprng --token "${CRATES_TOKEN}" ${DRY_RUN}
|
||||
cargo publish -p tfhe-csprng ${DRY_RUN}
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
15
.github/workflows/make_release_tfhe_fft.yml
vendored
15
.github/workflows/make_release_tfhe_fft.yml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -67,23 +67,30 @@ jobs:
|
||||
name: Publish tfhe-fft Release
|
||||
runs-on: ubuntu-latest
|
||||
needs: [verify_tag, package] # for comparing hashes
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-fft --token "${CRATES_TOKEN}" ${DRY_RUN}
|
||||
cargo publish -p tfhe-fft ${DRY_RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
|
||||
15
.github/workflows/make_release_tfhe_ntt.yml
vendored
15
.github/workflows/make_release_tfhe_ntt.yml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -67,23 +67,30 @@ jobs:
|
||||
name: Publish tfhe-ntt Release
|
||||
runs-on: ubuntu-latest
|
||||
needs: [verify_tag, package] # for comparing hashes
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-ntt --token "${CRATES_TOKEN}" ${DRY_RUN}
|
||||
cargo publish -p tfhe-ntt ${DRY_RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
|
||||
@@ -26,7 +26,7 @@ jobs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -60,23 +60,29 @@ jobs:
|
||||
name: Publish tfhe-versionable-derive Release
|
||||
needs: [ verify_tag, package-derive ] # for comparing hashes
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate-tfhe-versionable-derive
|
||||
path: target/package
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
run: |
|
||||
cargo publish -p tfhe-versionable-derive --token "${CRATES_TOKEN}"
|
||||
cargo publish -p tfhe-versionable-derive
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
@@ -103,7 +109,7 @@ jobs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -139,21 +145,24 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate-tfhe-versionable
|
||||
path: target/package
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
run: |
|
||||
cargo publish -p tfhe-versionable --token "${CRATES_TOKEN}"
|
||||
cargo publish -p tfhe-versionable
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
16
.github/workflows/make_release_zk_pok.yml
vendored
16
.github/workflows/make_release_zk_pok.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -64,27 +64,33 @@ jobs:
|
||||
name: Publish tfhe-zk-pok Release
|
||||
needs: [verify_tag, package] # for comparing hashes
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate-zk-pok
|
||||
path: target/package
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-zk-pok --token "${CRATES_TOKEN}" ${DRY_RUN}
|
||||
cargo publish -p tfhe-zk-pok ${DRY_RUN}
|
||||
- name: Verify hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
6
.github/workflows/parameters_check.yml
vendored
6
.github/workflows/parameters_check.yml
vendored
@@ -19,17 +19,17 @@ jobs:
|
||||
runs-on: large_ubuntu_16-22.04
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Checkout lattice-estimator
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: malb/lattice-estimator
|
||||
path: lattice_estimator
|
||||
ref: 'e80ec6bbbba212428b0e92d0467c18629cf9ed67'
|
||||
ref: '52f4b7a99ae7b5dfd088c5c295070bd38ff0d1e0'
|
||||
persist-credentials: 'false'
|
||||
|
||||
- name: Install Sage
|
||||
|
||||
2
.github/workflows/sync_on_push.yml
vendored

@@ -15,7 +15,7 @@ jobs:
  runs-on: ubuntu-latest
  steps:
    - name: Checkout repo
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
      with:
        fetch-depth: 0
        persist-credentials: 'false'
3
.gitignore
vendored

@@ -36,9 +36,6 @@ package-lock.json
.env
__pycache__

# Dir used for backward compatibility test data
# First directive is to ignore symlinks
tests/tfhe-backward-compat-data
ci/

# In case someone clones the lattice-estimator locally to verify security
@@ -30,7 +30,7 @@ itertools = "0.14"
num-complex = "0.4"
pulp = { version = "0.21", default-features = false }
rand = "0.8"
rayon = "1"
rayon = "1.11"
serde = { version = "1.0", default-features = false }
wasm-bindgen = "0.2.100"
263
Makefile

@@ -23,7 +23,6 @@ BENCH_PARAM_TYPE?=classical
BENCH_PARAMS_SET?=default
NODE_VERSION=22.6
BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
TFHE_SPEC:=tfhe
WASM_PACK_VERSION="0.13.1"
# We are kind of hacking the cut here, the version cannot contain a quote '"'
WASM_BINDGEN_VERSION:=$(shell grep '^wasm-bindgen[[:space:]]*=' Cargo.toml | cut -d '"' -f 2 | xargs)
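The cut-based extraction above can be checked in isolation; a minimal sketch (the sample line is invented for the demo, not read from the repository's Cargo.toml):

```bash
#!/usr/bin/env bash
# Illustrative only: reproduces the Makefile's version-extraction pipeline on a
# hand-written sample of a Cargo.toml dependency line.
line='wasm-bindgen = "0.2.100"'
# Splitting on '"' puts the version in field 2; xargs trims surrounding whitespace.
echo "$line" | cut -d '"' -f 2 | xargs
# prints: 0.2.100
```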
@@ -97,6 +96,12 @@ install_rs_build_toolchain:
( echo "Unable to install $(RS_BUILD_TOOLCHAIN) toolchain, check your rustup installation. \
Rustup can be downloaded at https://rustup.rs/" && exit 1 )

.PHONY: install_rs_msrv_toolchain # Install the msrv toolchain
install_rs_msrv_toolchain:
@rustup toolchain install --profile default "$(MIN_RUST_VERSION)" || \
( echo "Unable to install $(MIN_RUST_VERSION) toolchain, check your rustup installation. \
Rustup can be downloaded at https://rustup.rs/" && exit 1 )

.PHONY: install_build_wasm32_target # Install the wasm32 toolchain used for builds
install_build_wasm32_target: install_rs_build_toolchain
rustup +$(RS_BUILD_TOOLCHAIN) target add wasm32-unknown-unknown || \
@@ -303,28 +308,28 @@ clippy_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
|
||||
check_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC)
|
||||
-p tfhe
|
||||
|
||||
.PHONY: clippy_hpu # Run clippy lints on tfhe with "hpu" enabled
|
||||
clippy_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,hpu,pbs-stats,extended-types \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
|
||||
clippy_gpu_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
|
||||
--all-targets \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
|
||||
fix_newline: check_linelint_installed
|
||||
@@ -345,55 +350,55 @@ check_workflow_security: install_zizmor
|
||||
.PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
|
||||
clippy_core: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=nightly-avx512 \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=experimental,nightly-avx512 \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=zk-pok \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_boolean # Run clippy lints enabling the boolean features
|
||||
clippy_boolean: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_shortint # Run clippy lints enabling the shortint features
|
||||
clippy_shortint: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=shortint \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=shortint,experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=zk-pok,shortint \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_integer # Run clippy lints enabling the integer features
|
||||
clippy_integer: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=integer \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=integer,experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=integer,experimental,extended-types \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy # Run clippy lints enabling the boolean, shortint, integer
|
||||
clippy: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=boolean,shortint,integer \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_rustdoc # Run clippy lints on doctests enabling the boolean, shortint, integer and zk-pok
|
||||
clippy_rustdoc: install_rs_check_toolchain
|
||||
@@ -404,7 +409,7 @@ clippy_rustdoc: install_rs_check_toolchain
|
||||
CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
|
||||
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental \
|
||||
-p $(TFHE_SPEC)
|
||||
-p tfhe
|
||||
|
||||
.PHONY: clippy_rustdoc_gpu # Run clippy lints on doctests enabling the boolean, shortint, integer and zk-pok
|
||||
clippy_rustdoc_gpu: install_rs_check_toolchain
|
||||
@@ -415,22 +420,22 @@ clippy_rustdoc_gpu: install_rs_check_toolchain
|
||||
CARGO_TERM_QUIET=true CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
|
||||
--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
|
||||
-p $(TFHE_SPEC)
|
||||
-p tfhe
|
||||
|
||||
.PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
|
||||
clippy_c_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,extended-types \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
|
||||
clippy_js_wasm_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok,extended-types \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
|
||||
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,extended-types \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_tasks # Run clippy lints on helper tasks crate.
|
||||
clippy_tasks: install_rs_check_toolchain
|
||||
@@ -451,15 +456,17 @@ clippy_ws_tests: install_rs_check_toolchain
|
||||
clippy_all_targets: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,extended-types \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,extended-types,experimental \
|
||||
-p $(TFHE_SPEC) -- --no-deps -D warnings
|
||||
-p tfhe -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
|
||||
clippy_tfhe_csprng: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=parallel,software-prng -p tfhe-csprng -- --no-deps -D warnings
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
|
||||
--features=parallel -p tfhe-csprng -- --no-deps -D warnings
|
||||
|
||||
.PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
|
||||
clippy_zk_pok: install_rs_check_toolchain
|
||||
@@ -542,63 +549,72 @@ tfhe_lints: install_cargo_dylint
|
||||
.PHONY: build_core # Build core_crypto without experimental features
|
||||
build_core: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
-p $(TFHE_SPEC)
|
||||
-p tfhe
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=nightly-avx512 -p $(TFHE_SPEC); \
|
||||
--features=nightly-avx512 -p tfhe; \
|
||||
fi
|
||||
|
||||
.PHONY: build_core_experimental # Build core_crypto with experimental features
|
||||
build_core_experimental: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=experimental -p $(TFHE_SPEC)
|
||||
--features=experimental -p tfhe
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=experimental,nightly-avx512 -p $(TFHE_SPEC); \
|
||||
--features=experimental,nightly-avx512 -p tfhe; \
|
||||
fi
|
||||
|
||||
.PHONY: build_boolean # Build with boolean enabled
|
||||
build_boolean: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=boolean -p $(TFHE_SPEC) --all-targets
|
||||
--features=boolean -p tfhe --all-targets
|
||||
|
||||
.PHONY: build_shortint # Build with shortint enabled
|
||||
build_shortint: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=shortint -p $(TFHE_SPEC) --all-targets
|
||||
--features=shortint -p tfhe --all-targets
|
||||
|
||||
.PHONY: build_integer # Build with integer enabled
|
||||
build_integer: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=integer -p $(TFHE_SPEC) --all-targets
|
||||
--features=integer -p tfhe --all-targets
|
||||
|
||||
.PHONY: build_tfhe_full # Build with boolean, shortint and integer enabled
|
||||
build_tfhe_full: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
|
||||
--features=boolean,shortint,integer -p tfhe --all-targets
|
||||
|
||||
.PHONY: build_tfhe_coverage # Build with test coverage enabled
|
||||
build_tfhe_coverage: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
|
||||
--features=boolean,shortint,integer,internal-keycache -p tfhe --tests
|
||||
|
||||
# As of 05/08/2025 this is the set of features that can be easily compiled without additional
|
||||
# toolkits
|
||||
.PHONY: build_tfhe_msrv # Build with msrv compiler
|
||||
build_tfhe_msrv: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo +$(MIN_RUST_VERSION) build --profile dev \
|
||||
--features=boolean,extended-types,hpu,hpu-debug \
|
||||
--features=hpu-v80,integer,noise-asserts \
|
||||
--features=pbs-stats,shortint,strings,zk-pok -p tfhe
|
||||
|
||||
.PHONY: build_c_api # Build the C API for boolean, shortint and integer
|
||||
build_c_api: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types \
|
||||
-p $(TFHE_SPEC)
|
||||
-p tfhe
|
||||
|
||||
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
|
||||
build_c_api_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,extended-types,gpu \
|
||||
-p $(TFHE_SPEC)
|
||||
-p tfhe
|
||||
|
||||
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
|
||||
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
|
||||
-p $(TFHE_SPEC)
|
||||
-p tfhe
|
||||
|
||||
.PHONY: build_web_js_api # Build the js API targeting the web browser
|
||||
build_web_js_api: install_rs_build_toolchain install_wasm_pack
|
||||
@@ -633,10 +649,10 @@ build_tfhe_csprng: install_rs_build_toolchain
|
||||
.PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
|
||||
test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=experimental,zk-pok -p $(TFHE_SPEC) -- core_crypto::
|
||||
--features=experimental,zk-pok -p tfhe -- core_crypto::
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=experimental,zk-pok,nightly-avx512 -p $(TFHE_SPEC) -- core_crypto::; \
|
||||
--features=experimental,zk-pok,nightly-avx512 -p tfhe -- core_crypto::; \
|
||||
fi
|
||||
|
||||
.PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
|
||||
@@ -645,13 +661,13 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
|
||||
--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
|
||||
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=experimental,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- core_crypto::
|
||||
-p tfhe -- core_crypto::
|
||||
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
|
||||
--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
|
||||
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=experimental,internal-keycache,nightly-avx512 \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
|
||||
-p tfhe -- -Z unstable-options --report-time core_crypto::; \
|
||||
fi
|
||||
|
||||
.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
|
||||
@@ -668,23 +684,47 @@ test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
|
||||
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
|
||||
test_core_crypto_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
--features=gpu -p tfhe -- core_crypto::gpu::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
|
||||
--features=gpu -p tfhe -- core_crypto::gpu::
|
||||
|
||||
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
|
||||
test_integer_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=4
|
||||
--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=2
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
--features=integer,gpu -p tfhe -- integer::gpu::server_key::
|
||||
|
||||
.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
|
||||
test_integer_gpu_debug: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile release_lto_off \
|
||||
--features=integer,gpu-debug -vv -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=1 --nocapture
|
||||
--features=integer,gpu-debug -vv -p tfhe -- integer::gpu::server_key:: --test-threads=1 --nocapture
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile release_lto_off \
|
||||
--features=integer,gpu-debug -p $(TFHE_SPEC) -- integer::gpu::server_key::
|
||||
--features=integer,gpu-debug -p tfhe -- integer::gpu::server_key::
|
||||
|
||||
.PHONY: test_high_level_api_gpu_valgrind # Run the tests of the integer module with Debug flags for CUDA
|
||||
test_high_level_api_gpu_valgrind: install_rs_build_toolchain install_cargo_nextest
|
||||
export RUSTFLAGS="-C target-cpu=x86-64" && \
|
||||
export CARGO_RS_BUILD_TOOLCHAIN="$(CARGO_RS_BUILD_TOOLCHAIN)" && \
|
||||
export TFHE_SPEC="tfhe" && \
|
||||
export CARGO_PROFILE="$(CARGO_PROFILE)" && scripts/check_memory_errors.sh --cpu
|
||||
|
||||
.PHONY: test_high_level_api_gpu_sanitizer # Run the tests of the integer module with Debug flags for CUDA
|
||||
test_high_level_api_gpu_sanitizer: install_rs_build_toolchain install_cargo_nextest
|
||||
export RUSTFLAGS="-C target-cpu=x86-64" && \
|
||||
export CARGO_RS_BUILD_TOOLCHAIN="$(CARGO_RS_BUILD_TOOLCHAIN)" && \
|
||||
export TFHE_SPEC="tfhe" && \
|
||||
export CARGO_PROFILE="$(CARGO_PROFILE)" && scripts/check_memory_errors.sh --gpu
|
||||
|
||||
.PHONY: test_integer_hl_test_gpu_check_warnings
|
||||
test_integer_hl_test_gpu_check_warnings: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build \
|
||||
--features=integer,internal-keycache,gpu-debug,zk-pok -vv -p tfhe &> /tmp/gpu_compile_output
|
||||
WARNINGS=$$(cat /tmp/gpu_compile_output | grep ": warning #" | grep "\[tfhe-cuda-backend" | grep -v "inline qualifier" || true) && \
|
||||
if [[ "$${WARNINGS}" != "" ]]; then \
|
||||
echo "FAILING BECAUSE CUDA COMPILATION WARNINGS WERE DETECTED: " && \
|
||||
echo "$${WARNINGS}" && exit 1; \
|
||||
fi
|
||||
|
||||
|
||||
.PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
|
||||
@@ -693,27 +733,27 @@ test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
|
||||
LONG_TESTS=TRUE \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)" --backend "gpu"
|
||||
--tfhe-package "tfhe" --backend "gpu"
|
||||
|
||||
.PHONY: test_integer_short_run_gpu # Run the long run integer tests on the gpu backend
|
||||
test_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
|
||||
TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence integer::gpu::server_key::radix::tests_long_run::test_signed_random_op_sequence --test-threads=1 --nocapture
|
||||
--features=integer,gpu -p tfhe -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence integer::gpu::server_key::radix::tests_long_run::test_signed_random_op_sequence --test-threads=1 --nocapture
|
||||
|
||||
.PHONY: test_integer_compression
|
||||
test_integer_compression: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=integer -p $(TFHE_SPEC) -- integer::ciphertext::compressed_ciphertext_list::tests::
|
||||
--features=integer -p tfhe -- integer::ciphertext::compressed_ciphertext_list::tests::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=integer -p $(TFHE_SPEC) -- integer::ciphertext::compress
|
||||
--features=integer -p tfhe -- integer::ciphertext::compress
|
||||
|
||||
.PHONY: test_integer_compression_gpu
|
||||
test_integer_compression_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::
|
||||
--features=integer,gpu -p tfhe -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
|
||||
--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
|
||||
--features=integer,gpu -p tfhe -- integer::gpu::ciphertext::compress
|
||||
|
||||
.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
|
||||
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -722,7 +762,7 @@ test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
--tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_unsigned_integer_gpu_ci # Run the tests for unsigned integer ci on gpu backend
|
||||
test_unsigned_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -731,7 +771,7 @@ test_unsigned_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
--unsigned-only --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_signed_integer_gpu_ci # Run the tests for signed integer ci on gpu backend
|
||||
test_signed_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -740,7 +780,7 @@ test_signed_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
--signed-only --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_integer_multi_bit_gpu_ci # Run the tests for integer ci on gpu backend running only multibit tests
|
||||
test_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -749,7 +789,7 @@ test_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
--tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_unsigned_integer_multi_bit_gpu_ci # Run the tests for unsigned integer ci on gpu backend running only multibit tests
|
||||
test_unsigned_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -758,7 +798,7 @@ test_unsigned_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
--unsigned-only --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_signed_integer_multi_bit_gpu_ci # Run the tests for signed integer ci on gpu backend running only multibit tests
|
||||
test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -767,34 +807,34 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
--signed-only --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_integer_hpu_ci # Run the tests for integer ci on hpu backend
|
||||
test_integer_hpu_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
cargo test --release -p $(TFHE_SPEC) --features hpu-v80 --test hpu
|
||||
cargo test --release -p tfhe --features hpu-v80 --test hpu
|
||||
|
||||
.PHONY: test_integer_hpu_mockup_ci # Run the tests for integer ci on hpu backend and mockup
|
||||
test_integer_hpu_mockup_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
source ./setup_hpu.sh --config sim ; \
|
||||
cargo build --release --bin hpu_mockup; \
|
||||
coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \
|
||||
coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \
|
||||
HPU_TEST_ITER=1 \
|
||||
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
|
||||
cargo test --profile devo -p tfhe --features hpu --test hpu -- u32 && \
|
||||
kill %1
|
||||
|
||||
.PHONY: test_integer_hpu_mockup_ci_fast # Run the quick tests for integer ci on hpu backend and mockup.
|
||||
test_integer_hpu_mockup_ci_fast: install_rs_check_toolchain install_cargo_nextest
|
||||
source ./setup_hpu.sh --config sim ; \
|
||||
cargo build --profile devo --bin hpu_mockup; \
|
||||
coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \
|
||||
coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \
|
||||
HPU_TEST_ITER=1 \
|
||||
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
|
||||
cargo test --profile devo -p tfhe --features hpu --test hpu -- u32 && \
|
||||
kill %1
|
||||
|
||||
.PHONY: test_boolean # Run the tests of the boolean module
|
||||
test_boolean: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=boolean -p $(TFHE_SPEC) -- boolean::
|
||||
--features=boolean -p tfhe -- boolean::
|
||||
|
||||
.PHONY: test_boolean_cov # Run the tests of the boolean module with code coverage
|
||||
test_boolean_cov: install_rs_check_toolchain install_tarpaulin
|
||||
@@ -802,13 +842,13 @@ test_boolean_cov: install_rs_check_toolchain install_tarpaulin
|
||||
--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
|
||||
$(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=boolean,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::
|
||||
-p tfhe -- -Z unstable-options --report-time boolean::
|
||||
|
||||
.PHONY: test_c_api_rs # Run the rust tests for the C API
|
||||
test_c_api_rs: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=boolean-c-api,shortint-c-api,high-level-c-api \
|
||||
-p $(TFHE_SPEC) \
|
||||
-p tfhe \
|
||||
c_api
|
||||
|
||||
.PHONY: test_c_api_c # Run the C tests for the C API
|
||||
@@ -827,19 +867,19 @@ test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "$(TFHE_SPEC)"
|
||||
--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_shortint_multi_bit_ci # Run the tests for shortint ci running only multibit tests
|
||||
test_shortint_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
|
||||
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
|
||||
FAST_TESTS="$(FAST_TESTS)" \
|
||||
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --tfhe-package "$(TFHE_SPEC)"
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_shortint # Run all the tests for shortint
|
||||
test_shortint: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=shortint,internal-keycache -p $(TFHE_SPEC) -- shortint::
|
||||
--features=shortint,internal-keycache -p tfhe -- shortint::
|
||||
|
||||
.PHONY: test_shortint_cov # Run the tests of the shortint module with code coverage
|
||||
test_shortint_cov: install_rs_check_toolchain install_tarpaulin
|
||||
@@ -847,7 +887,7 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
|
||||
--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
|
||||
$(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=shortint,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::
|
||||
-p tfhe -- -Z unstable-options --report-time shortint::
|
||||
|
||||
.PHONY: test_integer_ci # Run the tests for integer ci
|
||||
test_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -856,7 +896,7 @@ test_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
--tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_unsigned_integer_ci # Run the tests for unsigned integer ci
|
||||
test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -865,7 +905,7 @@ test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
--unsigned-only --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_signed_integer_ci # Run the tests for signed integer ci
|
||||
test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -874,7 +914,7 @@ test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
--signed-only --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_integer_multi_bit_ci # Run the tests for integer ci running only multibit tests
|
||||
test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -883,7 +923,7 @@ test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
--tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_unsigned_integer_multi_bit_ci # Run the tests for unsigned integer ci running only multibit tests
|
||||
test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -892,7 +932,7 @@ test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nex
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
|
||||
--unsigned-only --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_signed_integer_multi_bit_ci # Run the tests for signed integer ci running only multibit tests
|
||||
test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -901,7 +941,7 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
|
||||
NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--signed-only --tfhe-package "$(TFHE_SPEC)"
|
||||
--signed-only --tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_integer_long_run # Run the long run integer tests
|
||||
test_integer_long_run: install_rs_check_toolchain install_cargo_nextest
|
||||
@@ -909,22 +949,31 @@ test_integer_long_run: install_rs_check_toolchain install_cargo_nextest
|
||||
LONG_TESTS=TRUE \
|
||||
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
|
||||
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
|
||||
--tfhe-package "$(TFHE_SPEC)"
|
||||
--tfhe-package "tfhe"
|
||||
|
||||
.PHONY: test_noise_check # Run dedicated noise and pfail check tests
test_noise_check: install_rs_check_toolchain
@# First run the sanity checks to make sure the atomic patterns are correct
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=boolean,shortint,integer,nightly-avx512 -p tfhe -- sanity_check
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=boolean,shortint,integer,nightly-avx512 -p tfhe -- noise_check \
--test-threads=1 --nocapture
.PHONY: test_safe_serialization # Run the tests for safe serialization
|
||||
test_safe_serialization: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_serialization::
|
||||
--features=boolean,shortint,integer,internal-keycache -p tfhe -- safe_serialization::
|
||||
|
||||
.PHONY: test_zk # Run the tests for the zk module of the TFHE-rs crate
|
||||
test_zk: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=shortint,zk-pok -p $(TFHE_SPEC) -- zk::
|
||||
--features=shortint,zk-pok -p tfhe -- zk::
|
||||
|
||||
.PHONY: test_integer # Run all the tests for integer
|
||||
test_integer: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,internal-keycache -p $(TFHE_SPEC) -- integer::
|
||||
--features=integer,internal-keycache -p tfhe -- integer::
|
||||
|
||||
.PHONY: test_integer_cov # Run the tests of the integer module with code coverage
|
||||
test_integer_cov: install_rs_check_toolchain install_tarpaulin
|
||||
@@ -933,17 +982,17 @@ test_integer_cov: install_rs_check_toolchain install_tarpaulin
|
||||
--implicit-test-threads \
|
||||
--exclude-files $(COVERAGE_EXCLUDED_FILES) \
|
||||
--features=integer,internal-keycache \
|
||||
-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::
|
||||
-p tfhe -- -Z unstable-options --report-time integer::
|
||||
|
||||
.PHONY: test_high_level_api # Run all the tests for high_level_api
|
||||
test_high_level_api: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p $(TFHE_SPEC) \
|
||||
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
|
||||
-- high_level_api::
|
||||
|
||||
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
|
||||
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
|
||||
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
|
||||
-E "test(/high_level_api::.*gpu.*/)"
|
||||
|
||||
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
|
||||
@@ -951,13 +1000,13 @@ ifeq ($(HPU_CONFIG), v80)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
|
||||
--build-jobs=$(CARGO_BUILD_JOBS) \
|
||||
--test-threads=1 \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80 -p $(TFHE_SPEC) \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe \
|
||||
-E "test(/high_level_api::.*hpu.*/)"
|
||||
else
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
|
||||
--build-jobs=$(CARGO_BUILD_JOBS) \
|
||||
--test-threads=1 \
|
||||
--features=integer,internal-keycache,hpu -p $(TFHE_SPEC) \
|
||||
--features=integer,internal-keycache,hpu -p tfhe \
|
||||
-E "test(/high_level_api::.*hpu.*/)"
|
||||
endif
|
||||
|
||||
@@ -965,7 +1014,7 @@ endif
|
||||
.PHONY: test_strings # Run the tests for strings ci
|
||||
test_strings: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=shortint,integer,strings -p $(TFHE_SPEC) \
|
||||
--features=shortint,integer,strings -p tfhe \
|
||||
-- strings::
|
||||
|
||||
|
||||
@@ -973,24 +1022,24 @@ test_strings: install_rs_build_toolchain
|
||||
test_user_doc: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok,strings \
|
||||
-p $(TFHE_SPEC) \
|
||||
-p tfhe \
|
||||
-- test_user_docs::
|
||||
|
||||
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
|
||||
test_user_doc_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=internal-keycache,integer,zk-pok,gpu -p $(TFHE_SPEC) \
|
||||
--features=internal-keycache,integer,zk-pok,gpu -p tfhe \
|
||||
-- test_user_docs::
|
||||
|
||||
.PHONY: test_user_doc_hpu # Run tests for HPU from the .md documentation
|
||||
test_user_doc_hpu: install_rs_build_toolchain
|
||||
ifeq ($(HPU_CONFIG), v80)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=internal-keycache,integer,hpu,hpu-v80 -p $(TFHE_SPEC) \
|
||||
--features=internal-keycache,integer,hpu,hpu-v80 -p tfhe \
|
||||
-- test_user_docs::
|
||||
else
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
|
||||
--features=internal-keycache,integer,hpu -p $(TFHE_SPEC) \
|
||||
--features=internal-keycache,integer,hpu -p tfhe \
|
||||
-- test_user_docs::
|
||||
endif
|
||||
|
||||
@@ -1075,7 +1124,7 @@ doc: install_rs_check_toolchain
|
||||
DOCS_RS=1 \
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--features=boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok --no-deps -p $(TFHE_SPEC)
|
||||
--features=boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok --no-deps -p tfhe
|
||||
|
||||
.PHONY: docs # Build rust doc alias for doc
|
||||
docs: doc
|
||||
@@ -1086,7 +1135,7 @@ lint_doc: install_rs_check_toolchain
|
||||
DOCS_RS=1 \
|
||||
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
|
||||
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
|
||||
--features=boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok -p $(TFHE_SPEC) --no-deps
|
||||
--features=boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok -p tfhe --no-deps
|
||||
|
||||
.PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
|
||||
lint_docs: lint_doc
|
||||
@@ -1124,7 +1173,7 @@ check_parameter_export_ok:
|
||||
check_compile_tests: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=experimental,boolean,shortint,integer,internal-keycache \
|
||||
-p $(TFHE_SPEC)
|
||||
-p tfhe
|
||||
|
||||
@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
|
||||
"$(MAKE)" build_c_api && \
|
||||
@@ -1135,7 +1184,7 @@ check_compile_tests: install_rs_build_toolchain
|
||||
check_compile_tests_benches_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=experimental,boolean,shortint,integer,internal-keycache,gpu,zk-pok \
|
||||
-p $(TFHE_SPEC)
|
||||
-p tfhe
|
||||
mkdir -p "$(TFHECUDA_BUILD)" && \
|
||||
cd "$(TFHECUDA_BUILD)" && \
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
|
||||
@@ -1436,6 +1485,24 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
|
||||
nvm use $(NODE_VERSION) && \
|
||||
$(MAKE) bench_web_js_api_parallel_firefox
|
||||
|
||||
.PHONY: bench_hlapi # Run benchmarks for integer operations
|
||||
bench_hlapi: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi \
|
||||
--features=integer,internal-keycache,nightly-avx512 -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_gpu # Run benchmarks for integer operations on GPU
|
||||
bench_hlapi_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi \
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_hpu # Run benchmarks for integer operations on HPU
|
||||
bench_hlapi_hpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi \
|
||||
--features=integer,hpu,hpu-v80,internal-keycache,nightly-avx512 -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
|
||||
bench_hlapi_erc20: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
@@ -1495,13 +1562,13 @@ bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
|
||||
gen_key_cache: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
|
||||
--example generates_test_keys \
|
||||
--features=boolean,shortint,experimental,internal-keycache -p $(TFHE_SPEC) \
|
||||
--features=boolean,shortint,experimental,internal-keycache -p tfhe \
|
||||
-- $(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
|
||||
|
||||
.PHONY: gen_key_cache_core_crypto # Run function to generate keys and cache them for core_crypto tests
|
||||
gen_key_cache_core_crypto: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --tests --profile $(CARGO_PROFILE) \
|
||||
--features=experimental,internal-keycache -p $(TFHE_SPEC) -- --nocapture \
|
||||
--features=experimental,internal-keycache -p tfhe -- --nocapture \
|
||||
core_crypto::keycache::generate_keys
|
||||
|
||||
.PHONY: measure_hlapi_compact_pk_ct_sizes # Measure sizes of public keys and ciphertext for high-level API
|
||||
@@ -1582,7 +1649,7 @@ tfhe_lints

.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu test_integer_hl_test_gpu_check_warnings

.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
pcc_hpu: clippy_hpu clippy_hpu_backend clippy_hpu_mockup test_integer_hpu_mockup_ci_fast
@@ -204,7 +204,7 @@ When a new update is published in the Lattice Estimator, we update parameters ac
By default, the parameter sets used in the High-Level API with the x86 CPU backend have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).

The default parameters used in the High-Level API with the GPU backend are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at $p_{error} \le 2^{-64}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [2].
The default parameters used in the High-Level API with the GPU backend are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at $p_{error} \le 2^{-128}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [2].

[1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf
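As a rough reading aid for the failure probabilities quoted above (a standard union-bound estimate, not a statement taken from the TFHE-rs documentation): if each programmable bootstrapping fails independently with probability at most $p_{error}$, then over $n$ bootstrappings

$$\Pr[\text{at least one failure}] \;\le\; n \cdot p_{error},$$

so, for example, $p_{error} \le 2^{-64}$ over $n = 2^{20}$ bootstrappings gives a bound of $2^{-44}$, while $p_{error} \le 2^{-128}$ keeps the bound below $2^{-108}$.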
@@ -12,7 +12,7 @@ extend-ignore-identifiers-re = [
"herlo",
# Example in trivium
"C9217BA0D762ACA1",
"0x[0-9a-fA-F]+"
"0x[0-9a-fA-F]+",
]

[files]
@@ -20,4 +20,6 @@ extend-exclude = [
"backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
"backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
"backends/tfhe-hpu-backend/config_store/**/*.link_summary",
"*.cbor",
"*.bcode",
]
@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.

# FHE shortint Trivium implementation

The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes
its setup a little more intricate.
@@ -138,9 +138,9 @@ Example code:

```rust
use tfhe::shortint::prelude::*;
use tfhe::shortint::parameters::current_params::{
    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
    V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
    V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{ConfigBuilder, generate_keys, FheUint64};
use tfhe::prelude::*;
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;

fn test_shortint() {
    let config = ConfigBuilder::default()
        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .use_custom_parameters(V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
        V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -1,9 +1,9 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::shortint::parameters::current_params::{
    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
    V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
    V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
    V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::shortint::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
     let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
         .build();
     let (hl_client_key, hl_server_key) = generate_keys(config);
     let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
     let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

     let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

     let ksk = KeySwitchingKey::new(
         (&client_key, Some(&server_key)),
         (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
     );

     let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {

 pub fn kreyvium_shortint_gen(c: &mut Criterion) {
     let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
         .build();
     let (hl_client_key, hl_server_key) = generate_keys(config);
     let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
     let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

     let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

     let ksk = KeySwitchingKey::new(
         (&client_key, Some(&server_key)),
         (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
     );

     let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {

 pub fn kreyvium_shortint_trans(c: &mut Criterion) {
     let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
         .build();
     let (hl_client_key, hl_server_key) = generate_keys(config);
     let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
     let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

     let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);

     let ksk = KeySwitchingKey::new(
         (&client_key, Some(&server_key)),
         (&underlying_ck, &underlying_sk),
-        V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
     );

     let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

@@ -1,9 +1,9 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
 use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};
|
||||
|
||||
pub fn trivium_shortint_warmup(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
|
||||
|
||||
pub fn trivium_shortint_gen(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
|
||||
|
||||
pub fn trivium_shortint_trans(c: &mut Criterion) {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
|
||||
@@ -1,16 +1,16 @@
 use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
 use tfhe::shortint::parameters::current_params::{
-    V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+    V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+    V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
+    V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo renaud1239/Kreyvium,
 // commit fd6828f68711276c25f55e605935028f5e843f43

 fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
-    assert!(a.len() % 8 == 0);
+    assert!(a.len().is_multiple_of(8));
     let mut hexadecimal: String = "".to_string();
     for test in a.chunks(8) {
         // Encoding is bytes in LSB order
@@ -63,7 +63,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
 }

 fn get_hexagonal_string_from_bytes(a: Vec<u8>) -> String {
-    assert!(a.len() % 8 == 0);
+    assert!(a.len().is_multiple_of(8));
     let mut hexadecimal: String = "".to_string();
     for test in a {
         hexadecimal.push_str(&format!("{test:02X?}"));
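The first helper above turns an LSB-first keystream (bit 0 of each byte comes first) into a hexadecimal string, eight bits at a time. A minimal standalone sketch of the same convention, illustrative only and not part of the diff (the helper name is ours):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    // bits[0] is the least significant bit of the first byte.
    std::string hex_from_lsb_first_bits(const std::vector<bool> &bits) {
      std::string out;
      for (std::size_t i = 0; i + 8 <= bits.size(); i += 8) {
        uint8_t byte = 0;
        for (std::size_t b = 0; b < 8; ++b) {
          byte |= static_cast<uint8_t>(bits[i + b]) << b; // LSB first
        }
        char buf[3];
        std::snprintf(buf, sizeof(buf), "%02X", byte);
        out.append(buf, 2);
      }
      return out;
    }
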
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
|
||||
#[test]
|
||||
fn kreyvium_test_shortint_long() {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
|
||||
use tfhe::prelude::*;
|
||||
use tfhe::shortint::parameters::current_params::{
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
|
||||
};
|
||||
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
|
||||
// Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
|
||||
// file testvectors/trivium-80.80.test-vectors
|
||||
|
||||
fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
|
||||
assert!(a.len() % 8 == 0);
|
||||
assert!(a.len().is_multiple_of(8));
|
||||
let mut hexadecimal: String = "".to_string();
|
||||
for test in a.chunks(8) {
|
||||
// Encoding is bytes in LSB order
|
||||
@@ -63,7 +63,7 @@ fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
|
||||
}
|
||||
|
||||
fn get_hexagonal_string_from_bytes(a: Vec<u8>) -> String {
|
||||
assert!(a.len() % 8 == 0);
|
||||
assert!(a.len().is_multiple_of(8));
|
||||
let mut hexadecimal: String = "".to_string();
|
||||
for test in a {
|
||||
hexadecimal.push_str(&format!("{test:02X?}"));
|
||||
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
|
||||
#[test]
|
||||
fn trivium_test_shortint_long() {
|
||||
let config = ConfigBuilder::default()
|
||||
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.use_custom_parameters(V1_4_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
|
||||
.build();
|
||||
let (hl_client_key, hl_server_key) = generate_keys(config);
|
||||
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
|
||||
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
|
||||
|
||||
let (client_key, server_key): (ClientKey, ServerKey) =
|
||||
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
gen_keys(V1_4_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
|
||||
|
||||
let ksk = KeySwitchingKey::new(
|
||||
(&client_key, Some(&server_key)),
|
||||
(&underlying_ck, &underlying_sk),
|
||||
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
V1_4_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
|
||||
);
|
||||
|
||||
let key_string = "0053A6F94C9FF24598EB".to_string();
|
||||
|
||||
@@ -78,8 +78,10 @@ endif()

 add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

+string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWERCASE)
+
 # Check if the DEBUG flag is defined
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
   # Debug mode
   message("Compiling in Debug mode")
   add_definitions(-DDEBUG)

@@ -26,6 +26,15 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
                                     void *lwe_array_out, uint32_t size,
                                     uint32_t log_modulus);

+void cuda_modulus_switch_64(void *stream, uint32_t gpu_index, void *lwe_out,
+                            const void *lwe_in, uint32_t size,
+                            uint32_t log_modulus);
+
+void cuda_centered_modulus_switch_64(void *stream, uint32_t gpu_index,
+                                     void *lwe_out, const void *lwe_in,
+                                     uint32_t lwe_dimension,
+                                     uint32_t log_modulus);
+
 void cuda_improve_noise_modulus_switch_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
     void const *lwe_array_in, void const *lwe_array_indexes,
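For orientation only (this reasoning is ours, not the header's): a modulus switch to log_modulus bits maps each 64-bit torus coefficient to its nearest multiple of 2^(64 - log_modulus), i.e. a rounding right shift. A host-side sketch of that single-coefficient operation, assuming 0 < log_modulus < 64; the CUDA entry points above apply this kind of rounding across whole LWE ciphertext lists:

    #include <cstdint>

    // Round a 64-bit torus coefficient down to log_modulus bits.
    // Wrap-around on the addition is the intended torus behaviour.
    uint64_t modulus_switch(uint64_t coefficient, uint32_t log_modulus) {
      uint32_t shift = 64 - log_modulus;
      uint64_t half = uint64_t{1} << (shift - 1);
      return (coefficient + half) >> shift; // result in [0, 2^log_modulus)
    }
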
@@ -19,6 +19,11 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort();
   }
 }
+
+// The PANIC macro should be used to validate user-inputs to GPU functions
+// it will execute in all targets, including production settings
+// e.g., cudaMemCopy to the device should check that the destination pointer is
+// a device pointer
 #define PANIC(format, ...)                                                     \
   {                                                                            \
     std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__,         \
@@ -26,6 +31,30 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort();                                                              \
   }

+// This is a generic assertion checking macro with user defined printf-style
+// message
+#define PANIC_IF_FALSE(cond, format, ...)                                      \
+  do {                                                                         \
+    if (!(cond)) {                                                             \
+      PANIC(format "\n\n %s\n", ##__VA_ARGS__, #cond);                         \
+    }                                                                          \
+  } while (0)
+
+#ifndef GPU_ASSERTS_DISABLE
+// The GPU assert should be used to validate assumptions in algorithms,
+// for example, checking that two user-provided quantities have a certain
+// relationship or that the size of the buffer provided to a function is
+// sufficient when it is filled with some algorithm that depends on
+// user-provided inputs e.g., OPRF corrections buffer should not have a size
+// higher than the number of blocks in the datatype that is generated
+#define GPU_ASSERT(cond, format, ...)                                          \
+  PANIC_IF_FALSE(cond, format, ##__VA_ARGS__)
+#else
+#define GPU_ASSERT(cond)                                                       \
+  do {                                                                         \
+  } while (0)
+#endif
+
 uint32_t cuda_get_device();
 void cuda_set_device(uint32_t gpu_index);

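A quick illustration of how the two macros divide responsibilities: PANIC_IF_FALSE guards user-visible preconditions and runs in every build, while GPU_ASSERT documents internal assumptions and can be compiled out with GPU_ASSERTS_DISABLE. The call site below is hypothetical (the helper is ours, not backend code); it only reuses the macros and the cuda_error helper declared in this header, in their enabled form:

    #include <cstddef>
    #include <cuda_runtime.h>

    // Hypothetical helper, for illustration only.
    void copy_to_device(void *d_dest, const void *h_src, std::size_t bytes,
                        std::size_t dest_capacity, cudaStream_t stream) {
      // User-facing input validation: executes in all targets.
      PANIC_IF_FALSE(d_dest != nullptr && h_src != nullptr,
                     "Cuda error: null pointer passed to copy_to_device");

      // Internal consistency check: removed when GPU_ASSERTS_DISABLE is set.
      GPU_ASSERT(bytes <= dest_capacity,
                 "Cuda error: destination too small (%zu bytes for %zu)",
                 dest_capacity, bytes);

      cuda_error(cudaMemcpyAsync(d_dest, h_src, bytes, cudaMemcpyHostToDevice,
                                 stream),
                 __FILE__, __LINE__);
    }
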
@@ -6,6 +6,7 @@

 extern std::mutex m;
 extern bool p2p_enabled;
+extern const int THRESHOLD_MULTI_GPU;

 extern "C" {
 int32_t cuda_setup_multi_gpu(int device_0_id);
@@ -3,6 +3,26 @@

 #include "../../pbs/pbs_enums.h"

+typedef struct {
+  void *ptr;
+  uint32_t num_radix_blocks;
+  uint32_t lwe_dimension;
+} CudaLweCiphertextListFFI;
+
+typedef struct {
+  void *ptr;
+  uint32_t storage_log_modulus;
+  uint32_t lwe_per_glwe;
+  // Input LWEs are grouped by groups of `lwe_per_glwe`(the last group may be
+  // smaller)
+  // Each group is then packed into one GLWE with `lwe_per_glwe` bodies (one for
+  // each LWE of the group). In the end the total number of bodies is equal to
+  // the number of input LWE
+  uint32_t total_lwe_bodies_count;
+  uint32_t glwe_dimension;
+  uint32_t polynomial_size;
+} CudaPackedGlweCiphertextListFFI;
+
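Reading the comment in the struct above: with total_lwe_bodies_count input LWEs and groups of lwe_per_glwe, the packed list is backed by ceil(total_lwe_bodies_count / lwe_per_glwe) GLWEs, and the bodies across all GLWEs add up to the number of input LWEs. A small host-side bookkeeping sketch of that grouping (helper names are ours, not part of this header):

    #include <cstdint>

    // Number of GLWEs backing the packed list: one per group of
    // `lwe_per_glwe` input LWEs, the last group possibly smaller.
    inline uint32_t packed_glwe_count(uint32_t total_lwe_bodies_count,
                                      uint32_t lwe_per_glwe) {
      return (total_lwe_bodies_count + lwe_per_glwe - 1) / lwe_per_glwe;
    }

    // Bodies carried by GLWE number `i` (0-based): full groups hold
    // `lwe_per_glwe` bodies, the final partial group holds the remainder.
    inline uint32_t bodies_in_glwe(uint32_t i, uint32_t total_lwe_bodies_count,
                                   uint32_t lwe_per_glwe) {
      uint32_t full_groups = total_lwe_bodies_count / lwe_per_glwe;
      if (i < full_groups)
        return lwe_per_glwe;
      return total_lwe_bodies_count - full_groups * lwe_per_glwe;
    }
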
 extern "C" {
 uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -10,28 +30,29 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
     uint32_t compression_polynomial_size, uint32_t lwe_dimension,
     uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-    bool allocate_gpu_memory);
+    uint32_t lwe_per_glwe, bool allocate_gpu_memory);

 uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
     int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
     uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
     uint32_t compression_polynomial_size, uint32_t lwe_dimension,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
-    bool allocate_ms_array);
+    uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t num_blocks_to_decompress, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);

 void cuda_integer_compress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
-    uint32_t num_nths, int8_t *mem_ptr);
+    CudaPackedGlweCiphertextListFFI *glwe_array_out,
+    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
+    int8_t *mem_ptr);

 void cuda_integer_decompress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
-    uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr);
+    CudaLweCiphertextListFFI *lwe_array_out,
+    CudaPackedGlweCiphertextListFFI const *glwe_in,
+    uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);

 void cleanup_cuda_integer_compress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -40,6 +61,41 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(
 void cleanup_cuda_integer_decompress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
     int8_t **mem_ptr_void);
+
+uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
+    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
+    uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t lwe_per_glwe, bool allocate_gpu_memory);
+
+uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t compression_glwe_dimension,
+    uint32_t compression_polynomial_size, uint32_t lwe_dimension,
+    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
+    bool allocate_gpu_memory, bool allocate_ms_array);
+
+void cuda_integer_compress_radix_ciphertext_128(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaPackedGlweCiphertextListFFI *glwe_array_out,
+    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
+    int8_t *mem_ptr);
+
+void cuda_integer_decompress_radix_ciphertext_128(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaLweCiphertextListFFI *lwe_array_out,
+    CudaPackedGlweCiphertextListFFI const *glwe_in,
+    uint32_t const *indexes_array, int8_t *mem_ptr);
+
+void cleanup_cuda_integer_compress_radix_ciphertext_128(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void cleanup_cuda_integer_decompress_radix_ciphertext_128(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
 }

 #endif

@@ -5,40 +5,33 @@
|
||||
|
||||
template <typename Torus> struct int_compression {
|
||||
int_radix_params compression_params;
|
||||
uint32_t storage_log_modulus;
|
||||
uint32_t lwe_per_glwe;
|
||||
|
||||
uint32_t body_count;
|
||||
|
||||
// Compression
|
||||
int8_t *fp_ks_buffer;
|
||||
Torus *tmp_lwe;
|
||||
Torus *tmp_glwe_array_out;
|
||||
bool gpu_memory_allocated;
|
||||
uint32_t lwe_per_glwe;
|
||||
|
||||
int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params compression_params,
|
||||
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
|
||||
uint32_t storage_log_modulus, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->compression_params = compression_params;
|
||||
this->lwe_per_glwe = lwe_per_glwe;
|
||||
this->storage_log_modulus = storage_log_modulus;
|
||||
this->body_count = num_radix_blocks;
|
||||
|
||||
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
|
||||
tmp_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
|
||||
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
|
||||
tmp_glwe_array_out = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0], size_tracker, allocate_gpu_memory);
|
||||
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory));
|
||||
tmp_glwe_array_out =
|
||||
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
|
||||
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0], size_tracker, allocate_gpu_memory));
|
||||
|
||||
size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
size_tracker += scratch_packing_keyswitch_lwe_list_to_glwe<Torus>(
|
||||
streams[0], gpu_indexes[0], &fp_ks_buffer,
|
||||
compression_params.small_lwe_dimension,
|
||||
compression_params.glwe_dimension, compression_params.polynomial_size,
|
||||
@@ -58,11 +51,7 @@ template <typename Torus> struct int_compression {
|
||||
template <typename Torus> struct int_decompression {
|
||||
int_radix_params encryption_params;
|
||||
int_radix_params compression_params;
|
||||
|
||||
uint32_t storage_log_modulus;
|
||||
|
||||
uint32_t num_radix_blocks;
|
||||
uint32_t body_count;
|
||||
uint32_t num_blocks_to_decompress;
|
||||
|
||||
Torus *tmp_extracted_glwe;
|
||||
Torus *tmp_extracted_lwe;
|
||||
@@ -74,57 +63,61 @@ template <typename Torus> struct int_decompression {
|
||||
int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params encryption_params,
|
||||
int_radix_params compression_params,
|
||||
uint32_t num_radix_blocks, uint32_t body_count,
|
||||
uint32_t storage_log_modulus, bool allocate_gpu_memory,
|
||||
uint32_t num_blocks_to_decompress, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->encryption_params = encryption_params;
|
||||
this->compression_params = compression_params;
|
||||
this->storage_log_modulus = storage_log_modulus;
|
||||
this->num_radix_blocks = num_radix_blocks;
|
||||
this->body_count = body_count;
|
||||
this->num_blocks_to_decompress = num_blocks_to_decompress;
|
||||
|
||||
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
Torus lwe_accumulator_size = (compression_params.glwe_dimension *
|
||||
compression_params.polynomial_size +
|
||||
1);
|
||||
decompression_rescale_lut = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, encryption_params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
uint64_t lwe_accumulator_size = (compression_params.glwe_dimension *
|
||||
compression_params.polynomial_size +
|
||||
1);
|
||||
|
||||
tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0], size_tracker, allocate_gpu_memory);
|
||||
num_blocks_to_decompress * glwe_accumulator_size * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
|
||||
tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
|
||||
num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0],
|
||||
num_blocks_to_decompress * sizeof(uint32_t), streams[0], gpu_indexes[0],
|
||||
size_tracker, allocate_gpu_memory);
|
||||
tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0], size_tracker, allocate_gpu_memory);
|
||||
num_blocks_to_decompress * lwe_accumulator_size * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
|
||||
|
||||
// Rescale is done using an identity LUT
|
||||
// Here we do not divide by message_modulus
|
||||
// Example: in the 2_2 case we are mapping a 2 bits message onto a 4 bits
|
||||
// space, we want to keep the original 2 bits value in the 4 bits space,
|
||||
// so we apply the identity and the encoding will rescale it for us.
|
||||
auto decompression_rescale_f = [](Torus x) -> Torus { return x; };
|
||||
// rescale is only needed on 64-bit decompression
|
||||
if constexpr (std::is_same_v<Torus, uint64_t>) {
|
||||
decompression_rescale_lut = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, encryption_params, 1,
|
||||
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto effective_compression_message_modulus =
|
||||
encryption_params.carry_modulus;
|
||||
auto effective_compression_carry_modulus = 1;
|
||||
// Rescale is done using an identity LUT
|
||||
// Here we do not divide by message_modulus
|
||||
// Example: in the 2_2 case we are mapping a 2-bit message onto a 4-bit
|
||||
// space, we want to keep the original 2-bit value in the 4-bit space,
|
||||
// so we apply the identity and the encoding will rescale it for us.
|
||||
decompression_rescale_lut = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, encryption_params, 1,
|
||||
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
|
||||
auto decompression_rescale_f = [](Torus x) -> Torus { return x; };
|
||||
|
||||
generate_device_accumulator_with_encoding<Torus>(
|
||||
streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
|
||||
decompression_rescale_lut->get_degree(0),
|
||||
decompression_rescale_lut->get_max_degree(0),
|
||||
encryption_params.glwe_dimension, encryption_params.polynomial_size,
|
||||
effective_compression_message_modulus,
|
||||
effective_compression_carry_modulus, encryption_params.message_modulus,
|
||||
encryption_params.carry_modulus, decompression_rescale_f,
|
||||
gpu_memory_allocated);
|
||||
auto effective_compression_message_modulus =
|
||||
encryption_params.carry_modulus;
|
||||
auto effective_compression_carry_modulus = 1;
|
||||
|
||||
decompression_rescale_lut->broadcast_lut(streams, gpu_indexes);
|
||||
generate_device_accumulator_with_encoding<Torus>(
|
||||
streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
|
||||
decompression_rescale_lut->get_degree(0),
|
||||
decompression_rescale_lut->get_max_degree(0),
|
||||
encryption_params.glwe_dimension, encryption_params.polynomial_size,
|
||||
effective_compression_message_modulus,
|
||||
effective_compression_carry_modulus,
|
||||
encryption_params.message_modulus, encryption_params.carry_modulus,
|
||||
decompression_rescale_f, gpu_memory_allocated);
|
||||
|
||||
decompression_rescale_lut->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
}
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
@@ -134,9 +127,11 @@ template <typename Torus> struct int_decompression {
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(tmp_indexes_array, streams[0],
|
||||
gpu_indexes[0], gpu_memory_allocated);
|
||||
|
||||
decompression_rescale_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete decompression_rescale_lut;
|
||||
if constexpr (std::is_same_v<Torus, uint64_t>) {
|
||||
decompression_rescale_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete decompression_rescale_lut;
|
||||
decompression_rescale_lut = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -37,6 +37,10 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
|
||||
|
||||
enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };
|
||||
|
||||
enum Direction { Trailing = 0, Leading = 1 };
|
||||
|
||||
enum BitValue { Zero = 0, One = 1 };
|
||||
|
||||
extern "C" {
|
||||
|
||||
typedef struct {
|
||||
@@ -83,7 +87,8 @@ uint64_t scratch_cuda_apply_univariate_lut_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
uint64_t lut_degree, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
|
||||
@@ -92,7 +97,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
void cuda_apply_univariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_radix_lwe,
|
||||
@@ -113,7 +118,8 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
uint64_t lut_degree, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_apply_bivariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -143,7 +149,7 @@ uint64_t scratch_cuda_full_propagation_64(
|
||||
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_full_propagation_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -162,7 +168,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
|
||||
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -196,7 +202,7 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -211,7 +217,7 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -234,7 +240,8 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool is_signed, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool is_signed, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -255,7 +262,7 @@ uint64_t scratch_cuda_integer_radix_comparison_kb_64(
|
||||
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
COMPARISON_TYPE op_type, bool is_signed, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_comparison_integer_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -285,7 +292,8 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
BITOP_TYPE op_type, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
BITOP_TYPE op_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_bitop_integer_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -314,7 +322,7 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_cmux_integer_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -336,7 +344,7 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, SHIFT_OR_ROTATE_TYPE shift_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_scalar_rotate_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -356,7 +364,8 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
uint32_t uses_carry, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -365,7 +374,8 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
uint32_t uses_carry, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
uint32_t uses_carry, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -400,7 +410,7 @@ uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t compute_overflow,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_overflowing_sub_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -424,7 +434,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool reduce_degrees_for_single_carry_propagation, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -444,7 +454,7 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -466,7 +476,8 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_div_rem_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -487,7 +498,8 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint64_t lut_degree, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
uint64_t lut_degree, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_compute_prefix_sum_hillis_steele_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -513,7 +525,8 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -533,7 +546,7 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_are_all_comparisons_block_true_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -554,7 +567,7 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_radix_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -585,7 +598,7 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_radix_blocks, uint32_t num_original_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_apply_noise_squashing_kb(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -607,7 +620,7 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -629,7 +642,7 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -649,7 +662,7 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t num_additional_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_extend_radix_with_sign_msb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -670,7 +683,7 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_signed_scalar_div_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -691,7 +704,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -716,7 +729,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, const CudaScalarDivisorFFI *scalar_divisor_ffi,
|
||||
uint32_t const active_bits_divisor, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
@@ -731,5 +744,70 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks, uint32_t counter_num_blocks, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, Direction direction,
|
||||
BitValue bit_value, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_ct, CudaRadixCiphertextFFI const *input_ct,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_cuda_integer_grouped_oprf_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t num_blocks_to_process, uint32_t num_blocks,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, uint32_t message_bits_per_block,
|
||||
uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_grouped_oprf_async_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out, const void *seeded_lwe_input,
|
||||
uint32_t num_blocks_to_process, int8_t *mem, void *const *bsks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_grouped_oprf_64(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
uint64_t scratch_integer_ilog2_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
uint32_t input_num_blocks, uint32_t counter_num_blocks,
|
||||
uint32_t num_bits_in_ciphertext, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_integer_ilog2_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *output_ct, CudaRadixCiphertextFFI const *input_ct,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_2,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks,
|
||||
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key);
|
||||
|
||||
void cleanup_cuda_integer_ilog2_kb_64(void *const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
} // extern C
|
||||
#endif // CUDA_INTEGER_H
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
#include <functional>
|
||||
#include <queue>
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
class NoiseLevel {
|
||||
public:
|
||||
// Constants equivalent to the Rust code
|
||||
@@ -92,6 +94,13 @@ void generate_device_accumulator_with_encoding(
|
||||
uint32_t output_message_modulus, uint32_t output_carry_modulus,
|
||||
std::function<Torus(Torus)> f, bool gpu_memory_allocated);
|
||||
|
||||
template <typename Torus>
|
||||
void generate_device_accumulator_no_encoding(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
|
||||
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, std::function<Torus(uint32_t)> f,
|
||||
bool gpu_memory_allocated);
|
||||
|
||||
/*
|
||||
* generate univariate accumulator (lut) for device pointer
|
||||
* stream - cuda stream
|
||||
@@ -234,7 +243,7 @@ struct int_radix_params {
|
||||
uint32_t grouping_factor;
|
||||
uint32_t message_modulus;
|
||||
uint32_t carry_modulus;
|
||||
bool allocate_ms_array;
|
||||
PBS_MS_REDUCTION_T noise_reduction_type;
|
||||
|
||||
int_radix_params(){};
|
||||
|
||||
@@ -244,7 +253,7 @@ struct int_radix_params {
|
||||
uint32_t ks_base_log, uint32_t pbs_level,
|
||||
uint32_t pbs_base_log, uint32_t grouping_factor,
|
||||
uint32_t message_modulus, uint32_t carry_modulus,
|
||||
bool allocate_ms_array)
|
||||
PBS_MS_REDUCTION_T noise_reduction_type)
|
||||
|
||||
: pbs_type(pbs_type), glwe_dimension(glwe_dimension),
|
||||
polynomial_size(polynomial_size), big_lwe_dimension(big_lwe_dimension),
|
||||
@@ -252,7 +261,7 @@ struct int_radix_params {
|
||||
ks_base_log(ks_base_log), pbs_level(pbs_level),
|
||||
pbs_base_log(pbs_base_log), grouping_factor(grouping_factor),
|
||||
message_modulus(message_modulus), carry_modulus(carry_modulus),
|
||||
allocate_ms_array(allocate_ms_array){};
|
||||
noise_reduction_type(noise_reduction_type){};
|
||||
|
||||
void print() {
|
||||
printf("pbs_type: %u, glwe_dimension: %u, "
|
||||
@@ -325,8 +334,8 @@ template <typename Torus> struct int_radix_lut {
|
||||
this->num_blocks = num_radix_blocks;
|
||||
this->num_luts = num_luts;
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
Torus lut_buffer_size =
|
||||
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
uint64_t lut_buffer_size =
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
|
||||
|
||||
gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
|
||||
@@ -336,17 +345,18 @@ template <typename Torus> struct int_radix_lut {
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_set_device(i);
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
int8_t *gpu_pbs_buffer;
|
||||
auto num_blocks_on_gpu =
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
|
||||
auto num_blocks_on_gpu = std::max(
|
||||
THRESHOLD_MULTI_GPU,
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
|
||||
|
||||
uint64_t size = 0;
|
||||
execute_scratch_pbs<Torus>(
|
||||
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
|
||||
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
|
||||
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
|
||||
allocate_gpu_memory, params.allocate_ms_array, size);
|
||||
allocate_gpu_memory, params.noise_reduction_type, size);
|
||||
if (i == 0) {
|
||||
size_tracker += size;
|
||||
}
|
||||
@@ -448,8 +458,8 @@ template <typename Torus> struct int_radix_lut {
|
||||
this->num_blocks = num_radix_blocks;
|
||||
this->num_luts = num_luts;
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
Torus lut_buffer_size =
|
||||
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
uint64_t lut_buffer_size =
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
|
||||
|
||||
gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
|
||||
@@ -541,8 +551,8 @@ template <typename Torus> struct int_radix_lut {
|
||||
this->num_blocks = num_radix_blocks;
|
||||
this->num_luts = num_luts;
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
Torus lut_buffer_size =
|
||||
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
uint64_t lut_buffer_size =
|
||||
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
|
||||
|
||||
gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
|
||||
@@ -552,17 +562,18 @@ template <typename Torus> struct int_radix_lut {
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_set_device(i);
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
int8_t *gpu_pbs_buffer;
|
||||
auto num_blocks_on_gpu =
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
|
||||
auto num_blocks_on_gpu = std::max(
|
||||
THRESHOLD_MULTI_GPU,
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
|
||||
|
||||
uint64_t size = 0;
|
||||
execute_scratch_pbs<Torus>(
|
||||
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
|
||||
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
|
||||
params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
|
||||
allocate_gpu_memory, params.allocate_ms_array, size);
|
||||
allocate_gpu_memory, params.noise_reduction_type, size);
|
||||
if (i == 0) {
|
||||
size_tracker += size;
|
||||
}
|
||||
@@ -703,7 +714,7 @@ template <typename Torus> struct int_radix_lut {
|
||||
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes) {
|
||||
int active_device = cuda_get_device();
|
||||
|
||||
Torus lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
|
||||
uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
|
||||
|
||||
auto src_lut = lut_vec[0];
|
||||
auto src_lut_indexes = lut_indexes_vec[0];
|
||||
@@ -752,22 +763,20 @@ template <typename Torus> struct int_radix_lut {
|
||||
if (!mem_reuse) {
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
tmp_lwe_before_ks, gpu_memory_allocated);
|
||||
if (gpu_memory_allocated) {
|
||||
for (int i = 0; i < buffer.size(); i++) {
|
||||
switch (params.pbs_type) {
|
||||
case MULTI_BIT:
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap(
|
||||
streams[i], gpu_indexes[i], &buffer[i]);
|
||||
break;
|
||||
case CLASSICAL:
|
||||
cleanup_cuda_programmable_bootstrap(streams[i], gpu_indexes[i],
|
||||
&buffer[i]);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown PBS type. ")
|
||||
}
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
for (int i = 0; i < buffer.size(); i++) {
|
||||
switch (params.pbs_type) {
|
||||
case MULTI_BIT:
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap(
|
||||
streams[i], gpu_indexes[i], &buffer[i]);
|
||||
break;
|
||||
case CLASSICAL:
|
||||
cleanup_cuda_programmable_bootstrap(streams[i], gpu_indexes[i],
|
||||
&buffer[i]);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown PBS type. ")
|
||||
}
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
delete tmp_lwe_before_ks;
|
||||
buffer.clear();
|
||||
@@ -821,7 +830,6 @@ template <typename InputTorus> struct int_noise_squashing_lut {
|
||||
InputTorus *lwe_indexes_in;
|
||||
|
||||
InputTorus *h_lwe_indexes_in;
|
||||
InputTorus *h_lwe_indexes_out;
|
||||
InputTorus *lwe_trivial_indexes;
|
||||
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
@@ -852,7 +860,7 @@ template <typename InputTorus> struct int_noise_squashing_lut {
|
||||
input_glwe_dimension * input_polynomial_size;
|
||||
this->input_big_lwe_dimension = input_big_lwe_dimension;
|
||||
|
||||
uint32_t lut_buffer_size = (params.glwe_dimension + 1) *
|
||||
uint64_t lut_buffer_size = (params.glwe_dimension + 1) *
|
||||
params.polynomial_size * sizeof(__uint128_t);
|
||||
|
||||
gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
|
||||
@@ -862,16 +870,17 @@ template <typename InputTorus> struct int_noise_squashing_lut {
|
||||
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
cuda_set_device(i);
|
||||
auto num_radix_blocks_on_gpu =
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
|
||||
cuda_set_device(gpu_indexes[i]);
|
||||
auto num_radix_blocks_on_gpu = std::max(
|
||||
THRESHOLD_MULTI_GPU,
|
||||
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
|
||||
int8_t *gpu_pbs_buffer;
|
||||
uint64_t size = 0;
|
||||
execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
|
||||
params.small_lwe_dimension, params.glwe_dimension,
|
||||
params.polynomial_size, params.pbs_level,
|
||||
num_radix_blocks_on_gpu, allocate_gpu_memory,
|
||||
params.allocate_ms_array, size);
|
||||
params.noise_reduction_type, size);
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
if (i == 0) {
|
||||
size_tracker += size;
|
||||
@@ -1294,7 +1303,7 @@ template <typename Torus> struct int_fullprop_buffer {
|
||||
params.message_modulus, params.carry_modulus, lut_f_carry,
|
||||
gpu_memory_allocated);
|
||||
|
||||
Torus lwe_indexes_size = 2 * sizeof(Torus);
|
||||
uint64_t lwe_indexes_size = 2 * sizeof(Torus);
|
||||
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
|
||||
for (int i = 0; i < 2; i++)
|
||||
h_lwe_indexes[i] = i;
|
||||
@@ -1942,7 +1951,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
|
||||
gpu_memory_allocated);
|
||||
|
||||
// Generate the indexes to switch between luts within the pbs
|
||||
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
|
||||
Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
|
||||
for (int index = 0; index < num_radix_blocks; index++) {
|
||||
@@ -2059,7 +2068,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
|
||||
h_scalar_array_cum_sum = new Torus[num_radix_blocks]();
|
||||
|
||||
// create lut objects for step 2
|
||||
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
uint32_t num_carry_to_resolve = num_groups - 1;
|
||||
uint32_t saturated_sub =
|
||||
((num_carry_to_resolve > 1) ? num_carry_to_resolve - 1 : 0);
|
||||
@@ -2348,9 +2357,11 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
grouping_size, num_groups, allocate_gpu_memory, size_tracker);
|
||||
|
||||
// Step 3 elements
|
||||
int num_luts_message_extract =
|
||||
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
|
||||
lut_message_extract = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 2, num_radix_blocks + 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
streams, gpu_indexes, gpu_count, params, num_luts_message_extract,
|
||||
num_radix_blocks + 1, allocate_gpu_memory, size_tracker);
|
||||
// lut for the first block in the first grouping
|
||||
auto f_message_extract = [message_modulus](Torus block) -> Torus {
|
||||
return (block >> 1) % message_modulus;
|
||||
@@ -2363,8 +2374,6 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
message_modulus, carry_modulus, f_message_extract,
|
||||
gpu_memory_allocated);
|
||||
|
||||
lut_message_extract->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
// This stores a single block that will be used to store the overflow or
|
||||
// carry results
|
||||
output_flag = new CudaRadixCiphertextFFI;
|
||||
@@ -2464,8 +2473,6 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
(num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
|
||||
allocate_gpu_memory);
|
||||
|
||||
lut_message_extract->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
|
||||
|
||||
@@ -2492,9 +2499,8 @@ template <typename Torus> struct int_sc_prop_memory {
|
||||
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
|
||||
(num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
|
||||
allocate_gpu_memory);
|
||||
|
||||
lut_message_extract->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
lut_message_extract->broadcast_lut(streams, gpu_indexes);
|
||||
};
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -2663,7 +2669,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
|
||||
gpu_memory_allocated);
|
||||
|
||||
// Generate the indexes to switch between luts within the pbs
|
||||
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
|
||||
Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
|
||||
|
||||
for (int index = 0; index < num_radix_blocks; index++) {
|
||||
@@ -3767,8 +3773,6 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->params = params;
|
||||
|
||||
Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
|
||||
|
||||
block_selector_f = [](Torus msb, Torus lsb) -> Torus {
|
||||
if (msb == IS_EQUAL) // EQUAL
|
||||
return lsb;
|
||||
@@ -3864,8 +3868,6 @@ template <typename Torus> struct int_comparison_diff_buffer {
|
||||
}
|
||||
};
|
||||
|
||||
Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
|
||||
|
||||
tmp_packed = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], tmp_packed, num_radix_blocks,
|
||||
@@ -4908,12 +4910,22 @@ template <typename Torus> struct int_scalar_mul_buffer {
|
||||
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete sc_prop_mem;
|
||||
delete all_shifted_buffer;
|
||||
if (!anticipated_buffers_drop) {
|
||||
release_buffers(streams, gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
void release_buffers(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
if (preshifted_buffer) {
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
preshifted_buffer, gpu_memory_allocated);
|
||||
delete preshifted_buffer;
|
||||
preshifted_buffer = nullptr;
|
||||
}
|
||||
|
||||
if (logical_scalar_shift_buffer) {
|
||||
logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
|
||||
delete logical_scalar_shift_buffer;
|
||||
delete preshifted_buffer;
|
||||
logical_scalar_shift_buffer = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -4988,7 +5000,6 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
// sub streams
|
||||
cudaStream_t *sub_streams_1;
|
||||
cudaStream_t *sub_streams_2;
|
||||
cudaStream_t *sub_streams_3;
|
||||
|
||||
// temporary device buffers
|
||||
CudaRadixCiphertextFFI *positive_numerator;
|
||||
@@ -5004,7 +5015,7 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
|
||||
this->active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
|
||||
this->params = params;
|
||||
this->is_signed = is_signed;
|
||||
|
||||
@@ -5069,16 +5080,11 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
// init sub streams
|
||||
sub_streams_1 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_3 =
|
||||
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < active_gpu_count; j++) {
|
||||
sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t));
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
|
||||
}
|
||||
|
||||
// init lookup tables
|
||||
@@ -5142,14 +5148,12 @@ template <typename Torus> struct int_div_rem_memory {
|
||||
delete compare_signed_bits_lut;
|
||||
|
||||
// release sub streams
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
for (uint i = 0; i < gpu_count; i++) {
|
||||
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
|
||||
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
|
||||
cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]);
|
||||
}
|
||||
free(sub_streams_1);
|
||||
free(sub_streams_2);
|
||||
free(sub_streams_3);
|
||||
|
||||
// delete temporary buffers
|
||||
delete positive_numerator;
|
||||
@@ -5203,6 +5207,7 @@ template <typename Torus> struct int_scalar_mul_high_buffer {
|
||||
|
||||
scalar_mul_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete scalar_mul_mem;
|
||||
scalar_mul_mem = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp,
|
||||
allocate_gpu_memory);
|
||||
@@ -5724,23 +5729,516 @@ template <typename Torus> struct int_signed_scalar_div_rem_buffer {
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], numerator_ct,
|
||||
allocate_gpu_memory);
|
||||
delete numerator_ct;
|
||||
numerator_ct = nullptr;
|
||||
|
||||
signed_div_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete signed_div_mem;
|
||||
signed_div_mem = nullptr;
|
||||
|
||||
scp_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete scp_mem;
|
||||
scp_mem = nullptr;
|
||||
|
||||
if (logical_scalar_shift_mem != nullptr) {
|
||||
logical_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete logical_scalar_shift_mem;
|
||||
logical_scalar_shift_mem = nullptr;
|
||||
}
|
||||
if (scalar_mul_mem != nullptr) {
|
||||
scalar_mul_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete scalar_mul_mem;
|
||||
scalar_mul_mem = nullptr;
|
||||
}
|
||||
sub_and_propagate_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete sub_and_propagate_mem;
|
||||
sub_and_propagate_mem = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
|
||||
int_radix_params params;
|
||||
bool allocate_gpu_memory;
|
||||
|
||||
int_radix_lut<Torus> *univ_lut_mem;
|
||||
int_radix_lut<Torus> *biv_lut_mem;
|
||||
|
||||
Direction direction;
|
||||
BitValue bit_value;
|
||||
|
||||
CudaRadixCiphertextFFI *tmp_ct;
|
||||
|
||||
int_prepare_count_of_consecutive_bits_buffer(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, const int_radix_params params,
|
||||
uint32_t num_radix_blocks, Direction direction, BitValue bit_value,
|
||||
const bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
this->direction = direction;
|
||||
this->bit_value = bit_value;
|
||||
|
||||
this->univ_lut_mem = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
this->biv_lut_mem = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
const uint32_t num_bits = std::log2(this->params.message_modulus);
|
||||
|
||||
auto generate_uni_lut_lambda = [this, num_bits](Torus x) -> Torus {
|
||||
x %= this->params.message_modulus;
|
||||
uint64_t count = 0;
|
||||
|
||||
if (this->direction == Trailing) {
|
||||
for (uint32_t i = 0; i < num_bits; ++i) {
|
||||
if (((x >> i) & 1) != this->bit_value) {
|
||||
break;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = num_bits - 1; i >= 0; --i) {
|
||||
if (((x >> i) & 1) != this->bit_value) {
|
||||
break;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], univ_lut_mem->get_lut(0, 0),
|
||||
univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
univ_lut_mem->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
auto generate_bi_lut_lambda =
|
||||
[num_bits](Torus block_num_bit_count,
|
||||
Torus more_significant_block_bit_count) -> Torus {
|
||||
if (more_significant_block_bit_count == num_bits) {
|
||||
return block_num_bit_count;
|
||||
}
|
||||
return 0;
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams[0], gpu_indexes[0], biv_lut_mem->get_lut(0, 0),
|
||||
biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
biv_lut_mem->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
this->tmp_ct = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], tmp_ct, num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
univ_lut_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete univ_lut_mem;
|
||||
|
||||
biv_lut_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete biv_lut_mem;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp_ct,
|
||||
allocate_gpu_memory);
|
||||
delete tmp_ct;
|
||||
}
|
||||
};
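The univariate and bivariate LUTs built by this buffer can be checked on cleartexts. A runnable sketch of the same logic; `Dir` and the modulus argument are stand-ins for the Direction/BitValue enums and the crypto parameters used above:

```cpp
#include <cmath>
#include <cstdint>

enum class Dir { Trailing, Leading };

// Per-block count of consecutive bits equal to bit_value, starting from the
// chosen end of the block (same logic as generate_uni_lut_lambda above).
inline uint64_t count_consecutive(uint64_t x, uint64_t message_modulus,
                                  Dir direction, uint64_t bit_value) {
  const uint32_t num_bits = static_cast<uint32_t>(std::log2(message_modulus));
  x %= message_modulus;
  uint64_t count = 0;
  if (direction == Dir::Trailing) {
    for (uint32_t i = 0; i < num_bits; ++i) {
      if (((x >> i) & 1) != bit_value)
        break;
      count++;
    }
  } else {
    for (int32_t i = static_cast<int32_t>(num_bits) - 1; i >= 0; --i) {
      if (((x >> i) & 1) != bit_value)
        break;
      count++;
    }
  }
  return count;
}

// Bivariate step (generate_bi_lut_lambda): a block's count only survives when
// the more significant block was saturated, i.e. all of its bits matched.
inline uint64_t propagate_count(uint64_t block_count,
                                uint64_t more_significant_count,
                                uint64_t num_bits) {
  return more_significant_count == num_bits ? block_count : 0;
}
```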
template <typename Torus> struct int_count_of_consecutive_bits_buffer {
|
||||
int_radix_params params;
|
||||
bool allocate_gpu_memory;
|
||||
uint32_t counter_num_blocks;
|
||||
|
||||
int_prepare_count_of_consecutive_bits_buffer<Torus> *prepare_mem = nullptr;
|
||||
CudaRadixCiphertextFFI *ct_prepared = nullptr;
|
||||
|
||||
int_sum_ciphertexts_vec_memory<Torus> *sum_mem = nullptr;
|
||||
int_sc_prop_memory<Torus> *propagate_mem = nullptr;
|
||||
CudaRadixCiphertextFFI *cts = nullptr;
|
||||
|
||||
int_count_of_consecutive_bits_buffer(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, const int_radix_params params,
|
||||
uint32_t num_radix_blocks, uint32_t counter_num_blocks,
|
||||
Direction direction, BitValue bit_value, const bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
this->counter_num_blocks = counter_num_blocks;
|
||||
|
||||
this->ct_prepared = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], ct_prepared, num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->prepare_mem = new int_prepare_count_of_consecutive_bits_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks, direction,
|
||||
bit_value, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->cts = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], cts, counter_num_blocks * num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->sum_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, counter_num_blocks,
|
||||
num_radix_blocks, true, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->propagate_mem = new int_sc_prop_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, counter_num_blocks, 0, 0,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], ct_prepared,
|
||||
allocate_gpu_memory);
|
||||
delete ct_prepared;
|
||||
ct_prepared = nullptr;
|
||||
|
||||
prepare_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete prepare_mem;
|
||||
prepare_mem = nullptr;
|
||||
|
||||
sum_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete sum_mem;
|
||||
sum_mem = nullptr;
|
||||
|
||||
propagate_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete propagate_mem;
|
||||
propagate_mem = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cts,
|
||||
allocate_gpu_memory);
|
||||
delete cts;
|
||||
cts = nullptr;
|
||||
}
|
||||
};
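A rough cleartext outline of the pipeline this buffer allocates for: prepare per-block counts, mask them with the bivariate LUT, then sum them into a small counter integer. Plain integers stand in for radix blocks, and the leading/zero configuration is used as the example; the real homomorphic flow may differ in detail:

```cpp
#include <cstdint>
#include <vector>

// blocks[0] is the least significant block; bits_per_block is log2 of the
// message modulus.
inline uint64_t count_leading_zero_bits_cleartext(
    const std::vector<uint64_t> &blocks, uint32_t bits_per_block) {
  uint64_t total = 0;
  bool all_higher_saturated = true;
  for (auto it = blocks.rbegin(); it != blocks.rend(); ++it) { // MSB first
    uint64_t c = 0;
    for (int32_t i = static_cast<int32_t>(bits_per_block) - 1; i >= 0; --i) {
      if (((*it >> i) & 1) != 0)
        break;
      c++;
    }
    if (all_higher_saturated)
      total += c; // the bivariate LUT zeroes the count otherwise
    all_higher_saturated = all_higher_saturated && (c == bits_per_block);
  }
  return total;
}
```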
template <typename Torus> struct int_grouped_oprf_memory {
|
||||
int_radix_params params;
|
||||
bool allocate_gpu_memory;
|
||||
|
||||
int_radix_lut<Torus> *luts;
|
||||
CudaRadixCiphertextFFI *plaintext_corrections;
|
||||
Torus *h_lut_indexes;
|
||||
|
||||
// with message_bits_per_block == ilog2(msg_modulus) from crypto params
|
||||
int_grouped_oprf_memory(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int_radix_params params,
|
||||
uint32_t num_blocks_to_process, uint32_t num_blocks,
|
||||
uint32_t message_bits_per_block,
|
||||
uint64_t total_random_bits, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
|
||||
if (num_blocks < num_blocks_to_process) {
|
||||
PANIC("num_blocks should be greater than num_blocks_to_process");
|
||||
}
|
||||
|
||||
uint32_t calculated_active_blocks =
|
||||
total_random_bits == 0
|
||||
? 0
|
||||
: (total_random_bits + message_bits_per_block - 1) /
|
||||
message_bits_per_block;
|
||||
if (num_blocks_to_process != calculated_active_blocks) {
|
||||
PANIC(
|
||||
"num_blocks_to_process should be equal to calculated_active_blocks");
|
||||
}
|
||||
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
|
||||
this->luts = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, message_bits_per_block,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->plaintext_corrections = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], this->plaintext_corrections, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
uint64_t message_modulus_log2 = (uint64_t)std::log2(params.message_modulus);
|
||||
if (message_modulus_log2 != message_bits_per_block) {
|
||||
PANIC("message_modulus_log2 should be equal to message_bits_per_block");
|
||||
}
|
||||
uint64_t carry_modulus_log2 = (uint64_t)std::log2(params.carry_modulus);
|
||||
uint64_t full_bits_count = 1 + carry_modulus_log2 + message_modulus_log2;
|
||||
uint64_t delta = 1ULL << (64 - full_bits_count);
|
||||
size_t lwe_size = params.big_lwe_dimension + 1;
|
||||
|
||||
// Pre-generate all possible LUTs.
|
||||
//
|
||||
for (uint32_t random_bit = 1; random_bit <= message_bits_per_block;
|
||||
++random_bit) {
|
||||
uint64_t p = 1ULL << random_bit;
|
||||
uint64_t poly_delta =
|
||||
2 * static_cast<uint64_t>(params.polynomial_size) / p;
|
||||
|
||||
if (2 * static_cast<uint64_t>(params.polynomial_size) < p) {
|
||||
PANIC("2 * static_cast<uint64_t>(params.polynomial_size) should be "
|
||||
"smaller than p");
|
||||
}
|
||||
|
||||
auto lut_f = [poly_delta, delta](uint32_t x) -> Torus {
|
||||
return (2 * (x / poly_delta) + 1) * delta / 2;
|
||||
};
|
||||
|
||||
uint64_t degree = 0;
|
||||
uint32_t lut_index = random_bit - 1;
|
||||
generate_device_accumulator_no_encoding<Torus>(
|
||||
streams[0], gpu_indexes[0], luts->get_lut(0, lut_index), degree,
|
||||
params.message_modulus, params.carry_modulus, params.glwe_dimension,
|
||||
params.polynomial_size, lut_f, allocate_gpu_memory);
|
||||
*luts->get_degree(lut_index) = degree;
|
||||
}
|
||||
|
||||
// For each block, this loop determines the exact number of bits to generate
|
||||
// (handling both bounded and unbounded cases), which pre-computed LUT to
|
||||
// use, and the final plaintext correction to add.
|
||||
//
|
||||
Torus *h_corrections =
|
||||
(Torus *)calloc(num_blocks * lwe_size, sizeof(Torus));
|
||||
this->h_lut_indexes = (Torus *)calloc(num_blocks, sizeof(Torus));
|
||||
|
||||
uint64_t bits_processed = 0;
|
||||
for (uint32_t i = 0; i < num_blocks_to_process; ++i) {
|
||||
|
||||
if (total_random_bits <= bits_processed) {
|
||||
PANIC("total_random_bits should be greater than bits_processed");
|
||||
}
|
||||
uint64_t bits_remaining = total_random_bits - bits_processed;
|
||||
uint32_t bits_for_this_block =
|
||||
std::min((uint64_t)message_bits_per_block, bits_remaining);
|
||||
|
||||
uint64_t p = 1ULL << bits_for_this_block;
|
||||
Torus plaintext_to_add = (p - 1) * delta / 2;
|
||||
|
||||
h_corrections[i * lwe_size + params.big_lwe_dimension] = plaintext_to_add;
|
||||
if (bits_for_this_block < 1) {
|
||||
PANIC("bits_for_this_block should be greater than 1");
|
||||
}
|
||||
this->h_lut_indexes[i] = bits_for_this_block - 1;
|
||||
|
||||
bits_processed += bits_for_this_block;
|
||||
}
|
||||
|
||||
// h_corrections contains num_blocks lwes of dimension big_lwe_dim
|
||||
// of which num_blocks_to_process lwes have a body that is set
|
||||
// to a correction and all others to 0.
|
||||
// All lwes in h_corrections have a mask equal to 0.
|
||||
// Copy the prepared plaintext corrections to the GPU.
|
||||
cuda_memcpy_async_to_gpu(this->plaintext_corrections->ptr, h_corrections,
|
||||
num_blocks * lwe_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
|
||||
// Copy the prepared LUT indexes to the GPU 0, before broadcast to all other
|
||||
// GPUs.
|
||||
cuda_memcpy_async_to_gpu(luts->get_lut_indexes(0, 0), this->h_lut_indexes,
|
||||
num_blocks * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
luts->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
free(h_corrections);
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
this->luts->release(streams, gpu_indexes, gpu_count);
|
||||
delete this->luts;
|
||||
this->luts = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
this->plaintext_corrections,
|
||||
this->allocate_gpu_memory);
|
||||
delete this->plaintext_corrections;
|
||||
this->plaintext_corrections = nullptr;
|
||||
|
||||
free(this->h_lut_indexes);
|
||||
this->h_lut_indexes = nullptr;
|
||||
}
|
||||
};
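Host-side sketch of how the constructor above distributes total_random_bits over the active blocks and derives one LUT index plus one plaintext correction per block. The struct and function names are illustrative only; the real buffer additionally leaves the unused trailing blocks at LUT index 0 with a zero correction:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

struct OprfBlockPlanSketch {
  uint32_t lut_index;       // bits_for_this_block - 1
  uint64_t correction_body; // (2^bits - 1) * delta / 2
};

inline std::vector<OprfBlockPlanSketch>
plan_oprf_blocks_sketch(uint64_t total_random_bits,
                        uint32_t message_bits_per_block, uint64_t delta) {
  std::vector<OprfBlockPlanSketch> plan;
  uint64_t bits_processed = 0;
  while (bits_processed < total_random_bits) {
    uint64_t remaining = total_random_bits - bits_processed;
    uint32_t bits = static_cast<uint32_t>(
        std::min<uint64_t>(message_bits_per_block, remaining));
    uint64_t p = 1ULL << bits;
    plan.push_back({bits - 1, (p - 1) * delta / 2});
    bits_processed += bits;
  }
  return plan;
}
```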
template <typename Torus> struct int_ilog2_buffer {
|
||||
int_radix_params params;
|
||||
bool allocate_gpu_memory;
|
||||
uint32_t input_num_blocks;
|
||||
uint32_t counter_num_blocks;
|
||||
uint32_t num_bits_in_ciphertext;
|
||||
|
||||
int_prepare_count_of_consecutive_bits_buffer<Torus> *prepare_mem;
|
||||
int_sum_ciphertexts_vec_memory<Torus> *sum_mem;
|
||||
int_fullprop_buffer<Torus> *final_propagate_mem;
|
||||
|
||||
CudaRadixCiphertextFFI *ct_in_buffer;
|
||||
CudaRadixCiphertextFFI *sum_input_cts;
|
||||
CudaRadixCiphertextFFI *sum_output_not_propagated;
|
||||
CudaRadixCiphertextFFI *message_blocks_not;
|
||||
CudaRadixCiphertextFFI *carry_blocks_not;
|
||||
CudaRadixCiphertextFFI *rotated_carry_blocks;
|
||||
|
||||
int_radix_lut<Torus> *lut_message_not;
|
||||
int_radix_lut<Torus> *lut_carry_not;
|
||||
|
||||
int_ilog2_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, const int_radix_params params,
|
||||
uint32_t input_num_blocks, uint32_t counter_num_blocks,
|
||||
uint32_t num_bits_in_ciphertext,
|
||||
const bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
this->input_num_blocks = input_num_blocks;
|
||||
this->counter_num_blocks = counter_num_blocks;
|
||||
this->num_bits_in_ciphertext = num_bits_in_ciphertext;
|
||||
|
||||
this->ct_in_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], this->ct_in_buffer, input_num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->prepare_mem = new int_prepare_count_of_consecutive_bits_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, input_num_blocks, Leading,
|
||||
Zero, allocate_gpu_memory, size_tracker);
|
||||
|
||||
uint32_t sum_input_total_blocks =
|
||||
(input_num_blocks + 1) * counter_num_blocks;
|
||||
this->sum_input_cts = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], this->sum_input_cts, sum_input_total_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->sum_mem = new int_sum_ciphertexts_vec_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, counter_num_blocks,
|
||||
input_num_blocks + 1, false, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->sum_output_not_propagated = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], this->sum_output_not_propagated,
|
||||
counter_num_blocks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->lut_message_not = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, counter_num_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::function<Torus(Torus)> lut_message_lambda =
|
||||
[this](uint64_t x) -> uint64_t {
|
||||
uint64_t message = x % this->params.message_modulus;
|
||||
return (~message) % this->params.message_modulus;
|
||||
};
|
||||
generate_device_accumulator(
|
||||
streams[0], gpu_indexes[0], this->lut_message_not->get_lut(0, 0),
|
||||
this->lut_message_not->get_degree(0),
|
||||
this->lut_message_not->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_message_lambda, allocate_gpu_memory);
|
||||
|
||||
this->lut_carry_not = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, 1, counter_num_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::function<Torus(Torus)> lut_carry_lambda =
|
||||
[this](uint64_t x) -> uint64_t {
|
||||
uint64_t carry = x / this->params.message_modulus;
|
||||
return (~carry) % this->params.message_modulus;
|
||||
};
|
||||
generate_device_accumulator(
|
||||
streams[0], gpu_indexes[0], this->lut_carry_not->get_lut(0, 0),
|
||||
this->lut_carry_not->get_degree(0),
|
||||
this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
||||
lut_carry_lambda, allocate_gpu_memory);
|
||||
|
||||
this->message_blocks_not = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], this->message_blocks_not,
|
||||
counter_num_blocks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->carry_blocks_not = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], this->carry_blocks_not, counter_num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->rotated_carry_blocks = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], this->rotated_carry_blocks,
|
||||
counter_num_blocks, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->final_propagate_mem =
|
||||
new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
this->ct_in_buffer, allocate_gpu_memory);
|
||||
delete this->ct_in_buffer;
|
||||
this->ct_in_buffer = nullptr;
|
||||
|
||||
this->prepare_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete this->prepare_mem;
|
||||
this->prepare_mem = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
this->sum_input_cts, allocate_gpu_memory);
|
||||
delete this->sum_input_cts;
|
||||
this->sum_input_cts = nullptr;
|
||||
|
||||
this->sum_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete this->sum_mem;
|
||||
this->sum_mem = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
this->sum_output_not_propagated,
|
||||
allocate_gpu_memory);
|
||||
delete this->sum_output_not_propagated;
|
||||
this->sum_output_not_propagated = nullptr;
|
||||
|
||||
this->lut_message_not->release(streams, gpu_indexes, gpu_count);
|
||||
delete this->lut_message_not;
|
||||
this->lut_message_not = nullptr;
|
||||
|
||||
this->lut_carry_not->release(streams, gpu_indexes, gpu_count);
|
||||
delete this->lut_carry_not;
|
||||
this->lut_carry_not = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
this->message_blocks_not,
|
||||
allocate_gpu_memory);
|
||||
delete this->message_blocks_not;
|
||||
this->message_blocks_not = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
this->carry_blocks_not, allocate_gpu_memory);
|
||||
delete this->carry_blocks_not;
|
||||
this->carry_blocks_not = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
this->rotated_carry_blocks,
|
||||
allocate_gpu_memory);
|
||||
delete this->rotated_carry_blocks;
|
||||
this->rotated_carry_blocks = nullptr;
|
||||
|
||||
this->final_propagate_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete this->final_propagate_mem;
|
||||
this->final_propagate_mem = nullptr;
|
||||
}
|
||||
};
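The two per-block LUTs used by this buffer simply complement the message part and the carry part modulo the message modulus; how the complemented blocks are combined downstream is not shown in this hunk. A cleartext sketch of the same lambdas (for a power-of-two modulus the complement equals modulus - 1 - value):

```cpp
#include <cstdint>

// Same functions as lut_message_lambda / lut_carry_lambda above, on cleartext.
inline uint64_t not_message(uint64_t x, uint64_t message_modulus) {
  uint64_t message = x % message_modulus;
  return (~message) % message_modulus; // == message_modulus - 1 - message
}

inline uint64_t not_carry(uint64_t x, uint64_t message_modulus) {
  uint64_t carry = x / message_modulus;
  return (~carry) % message_modulus;
}
```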
@@ -8,6 +8,6 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
#endif // CUDA_BOOTSTRAP_128_H
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include <stdint.h>
|
||||
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
|
||||
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
|
||||
enum PBS_MS_REDUCTION_T { NO_REDUCTION = 0, DRIFT = 1, CENTERED = 2 };
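The new enum replaces the former allocate_ms_array boolean across the scratch and buffer APIs below. A minimal sketch of how callers are expected to gate the drift-specific temporary array on it, mirroring the pbs_buffer changes further down:

```cpp
// Only the drift technique needs the extra temporary LWE array; the centered
// switch and the no-reduction path do not allocate it.
inline bool needs_drift_temp_array(PBS_MS_REDUCTION_T t) {
  return t == PBS_MS_REDUCTION_T::DRIFT;
}
```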
extern "C" {
|
||||
typedef struct {
|
||||
|
||||
@@ -83,23 +83,24 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
Torus *temp_lwe_array_in;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
bool uses_noise_reduction;
|
||||
PBS_MS_REDUCTION_T noise_reduction_type;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
pbs_buffer(cudaStream_t stream, uint32_t gpu_index, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
PBS_VARIANT pbs_variant, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array, uint64_t &size_tracker) {
|
||||
PBS_MS_REDUCTION_T noise_reduction_type, uint64_t &size_tracker)
|
||||
: noise_reduction_type(noise_reduction_type) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
cuda_set_device(gpu_index);
|
||||
this->uses_noise_reduction = allocate_ms_array;
|
||||
this->pbs_variant = pbs_variant;
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
this->temp_lwe_array_in = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(Torus),
|
||||
stream, gpu_index, size_tracker, allocate_ms_array);
|
||||
stream, gpu_index, size_tracker,
|
||||
noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT);
|
||||
switch (pbs_variant) {
|
||||
case PBS_VARIANT::DEFAULT: {
|
||||
uint64_t full_sm_step_one =
|
||||
@@ -234,7 +235,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
|
||||
if (uses_noise_reduction)
|
||||
if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
|
||||
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
@@ -252,26 +253,30 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
|
||||
uint64_t *trivial_indexes;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
bool uses_noise_reduction;
|
||||
PBS_MS_REDUCTION_T noise_reduction_type;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
pbs_buffer_128(cudaStream_t stream, uint32_t gpu_index,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array,
|
||||
uint64_t &size_tracker) {
|
||||
bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type,
|
||||
uint64_t &size_tracker)
|
||||
: noise_reduction_type(noise_reduction_type) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
cuda_set_device(gpu_index);
|
||||
this->pbs_variant = pbs_variant;
|
||||
this->uses_noise_reduction = allocate_ms_array;
|
||||
if (allocate_ms_array) {
|
||||
this->temp_lwe_array_in = (InputTorus *)cuda_malloc_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(InputTorus),
|
||||
stream, gpu_index);
|
||||
|
||||
if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
|
||||
this->temp_lwe_array_in =
|
||||
(InputTorus *)cuda_malloc_with_size_tracking_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(InputTorus),
|
||||
stream, gpu_index, size_tracker, allocate_gpu_memory);
|
||||
this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
|
||||
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
|
||||
size_tracker, allocate_ms_array);
|
||||
size_tracker, allocate_gpu_memory);
|
||||
uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
|
||||
for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
|
||||
h_trivial_indexes[i] = i;
|
||||
@@ -420,7 +425,7 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
|
||||
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
|
||||
if (uses_noise_reduction) {
|
||||
if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
|
||||
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
|
||||
@@ -500,7 +505,7 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
#endif
|
||||
|
||||
template <typename Torus>
|
||||
@@ -508,14 +513,14 @@ uint64_t scratch_cuda_programmable_bootstrap_cg(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t scratch_cuda_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
template <typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
|
||||
@@ -544,6 +549,11 @@ __device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t level_count);
|
||||
|
||||
template <typename T, uint32_t polynomial_size, uint32_t glwe_dimension,
|
||||
uint32_t level_count, uint32_t level_id>
|
||||
__device__ const T *get_ith_mask_kth_block_2_2_params(const T *ptr,
|
||||
int iteration, int k);
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
|
||||
uint32_t polynomial_size,
|
||||
|
||||
@@ -61,19 +61,19 @@ uint64_t scratch_cuda_programmable_bootstrap_32(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
uint64_t scratch_cuda_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
uint64_t scratch_cuda_programmable_bootstrap_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
bool allocate_ms_array);
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
|
||||
@@ -24,7 +24,7 @@ uint64_t scratch_cuda_expand_without_verification_64(
|
||||
const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
|
||||
uint32_t num_compact_lists, uint32_t message_modulus,
|
||||
uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
|
||||
bool allocate_gpu_memory, bool allocate_ms_array);
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_expand_without_verification_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
|
||||
@@ -138,7 +138,7 @@ template <typename Torus> struct zk_expand_mem {
|
||||
auto compact_list_id = 0;
|
||||
auto idx = 0;
|
||||
auto count = 0;
|
||||
// During flatenning, all num_lwes LWEs from all compact lists are stored
|
||||
// During flattening, all num_lwes LWEs from all compact lists are stored
|
||||
// sequentially on a Torus array. h_lwe_compact_input_indexes stores the
|
||||
// index of the first LWE related to the compact list that contains the i-th
|
||||
// LWE
|
||||
|
||||
@@ -84,6 +84,25 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out), size, log_modulus);
|
||||
}
|
||||
|
||||
void cuda_modulus_switch_64(void *stream, uint32_t gpu_index, void *lwe_out,
|
||||
const void *lwe_in, uint32_t size,
|
||||
uint32_t log_modulus) {
|
||||
host_modulus_switch<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_out),
|
||||
static_cast<const uint64_t *>(lwe_in), size,
|
||||
log_modulus);
|
||||
}
|
||||
|
||||
void cuda_centered_modulus_switch_64(void *stream, uint32_t gpu_index,
|
||||
void *lwe_out, const void *lwe_in,
|
||||
uint32_t lwe_dimension,
|
||||
uint32_t log_modulus) {
|
||||
host_centered_modulus_switch_inplace<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_out), static_cast<const uint64_t *>(lwe_in),
|
||||
lwe_dimension, log_modulus);
|
||||
}
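Both entry points above wrap the same per-coefficient rounding. A scalar sketch of that rounding, assuming the switched value is returned in the low log_modulus bits, as the device modulus_switch helper further down suggests:

```cpp
#include <cstdint>

inline uint64_t modulus_switch_scalar(uint64_t input, uint32_t log_modulus) {
  constexpr uint32_t BITS = 64;
  // Add half of the discarded range so the right shift rounds to nearest.
  uint64_t rounded = input + (uint64_t{1} << (BITS - log_modulus - 1));
  return rounded >> (BITS - log_modulus);
}
```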
// This end point is used only for testing purposes
|
||||
// its output always follows trivial ordering
|
||||
void cuda_improve_noise_modulus_switch_64(
|
||||
@@ -92,7 +111,7 @@ void cuda_improve_noise_modulus_switch_64(
|
||||
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
|
||||
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
|
||||
uint32_t log_modulus) {
|
||||
host_improve_noise_modulus_switch<uint64_t>(
|
||||
host_drift_modulus_switch<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t const *>(lwe_array_in),
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#ifndef CNCRT_CRYPTO_CUH
|
||||
#define CNCRT_CRPYTO_CUH
|
||||
#define CNCRT_CRYPTO_CUH
|
||||
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
@@ -137,6 +137,34 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Performs the decomposition for 2_2 params, assumes level_count = 1
|
||||
// this specialized version is needed if we plan to keep everything in regs
|
||||
template <typename T, class params, uint32_t base_log>
|
||||
__device__ void decompose_and_compress_level_2_2_params(double2 *result,
|
||||
T *state) {
|
||||
constexpr T mask_mod_b = (1ll << base_log) - 1ll;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
auto input1 = state[i];
|
||||
auto input2 = state[i + params::opt / 2];
|
||||
T res_re = input1 & mask_mod_b;
|
||||
T res_im = input2 & mask_mod_b;
|
||||
|
||||
input1 >>= base_log; // Update state
|
||||
input2 >>= base_log; // Update state
|
||||
|
||||
T carry_re = ((res_re - 1ll) | input1) & res_re;
|
||||
T carry_im = ((res_im - 1ll) | input2) & res_im;
|
||||
carry_re >>= (base_log - 1);
|
||||
carry_im >>= (base_log - 1);
|
||||
|
||||
res_re -= carry_re << base_log;
|
||||
res_im -= carry_im << base_log;
|
||||
|
||||
typecast_torus_to_double(res_re, result[i].x);
|
||||
typecast_torus_to_double(res_im, result[i].y);
|
||||
}
|
||||
}
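The per-digit arithmetic above can be exercised on a plain scalar. A sketch of one balanced digit extraction, matching the mask/carry trick used in the loop; with level_count = 1 there is no further carry to propagate into a next digit:

```cpp
#include <cstdint>

template <uint32_t base_log>
inline int64_t decompose_digit_balanced(uint64_t state_after_init) {
  constexpr uint64_t mask_mod_b = (1ull << base_log) - 1ull;
  uint64_t res = state_after_init & mask_mod_b;
  uint64_t rest = state_after_init >> base_log;
  // carry == 1 when the digit falls in the upper half of the base (with the
  // same tie-breaking rule on the remaining state as the device code above),
  // so it is re-centred into the negative half.
  uint64_t carry = (((res - 1ull) | rest) & res) >> (base_log - 1);
  return static_cast<int64_t>(res) - static_cast<int64_t>(carry << base_log);
}
```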
template <typename Torus>
|
||||
__device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
|
||||
Torus res = state & mask_mod_b;
|
||||
@@ -148,4 +176,4 @@ __device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
|
||||
return res;
|
||||
}
|
||||
|
||||
#endif // CNCRT_CRPYTO_H
|
||||
#endif // CNCRT_CRYPTO_CUH
|
||||
|
||||
@@ -54,9 +54,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t max_shared_memory) {
|
||||
if (gpu_count != 1)
|
||||
PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
|
||||
"supported yet.")
|
||||
PANIC_IF_FALSE(gpu_count == 1,
|
||||
"GPU error (batch_fft_ggsw_vector): multi-GPU execution on %d "
|
||||
"gpus is not supported yet.",
|
||||
gpu_count);
|
||||
|
||||
cuda_set_device(gpu_indexes[0]);
|
||||
|
||||
int shared_memory_size = sizeof(double) * polynomial_size;
|
||||
|
||||
@@ -124,8 +124,10 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
|
||||
num_blocks_per_sample, num_threads_x);
|
||||
|
||||
int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
|
||||
if (num_blocks_per_sample > 65536)
|
||||
PANIC("Cuda error (Keyswith): number of blocks per sample is too large");
|
||||
PANIC_IF_FALSE(
|
||||
num_blocks_per_sample <= 65536,
|
||||
"Cuda error (Keyswitch): number of blocks per sample (%d) is too large",
|
||||
num_blocks_per_sample);
|
||||
|
||||
// In multiplication of large integers (512, 1024, 2048), the number of
|
||||
// samples can be larger than 65536, so we need to set it in the first
|
||||
|
||||
@@ -55,7 +55,7 @@ __global__ void decompose_vectorize_init(Torus const *lwe_in, Torus *lwe_out,
|
||||
lwe_out[write_state_idx] = state;
|
||||
}
|
||||
|
||||
// Continue decomposiion of an array of Torus elements in place. Supposes
|
||||
// Continue decomposition of an array of Torus elements in place. Supposes
|
||||
// that the array contains already decomposed elements and
|
||||
// computes the new decomposed level in place.
|
||||
template <typename Torus>
|
||||
@@ -202,10 +202,12 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
|
||||
|
||||
auto stride_KSK_buffer = glwe_accumulator_size * level_count;
|
||||
|
||||
// Shared memory requirement is 8192 bytes for 64-bit Torus elements
|
||||
// Shared memory requirement is 4096, 8192, and 16384 bytes respectively for
|
||||
// 32, 64, and 128-bit Torus elements. We want to keep this as a sanity check
|
||||
uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
|
||||
if (shared_mem_size > 8192)
|
||||
PANIC("GEMM kernel error: shared memory required might be too large");
|
||||
// Sanity check: the shared memory size is a constant defined by the algorithm
|
||||
GPU_ASSERT(shared_mem_size <= 1024 * sizeof(Torus),
|
||||
"GEMM kernel error: shared memory required might be too large");
|
||||
|
||||
tgemm<Torus><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
|
||||
num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
|
||||
|
||||
@@ -73,6 +73,27 @@ __device__ inline void typecast_torus_to_double<__uint128_t>(__uint128_t x,
|
||||
r = __ll2double_rn(static_cast<uint64_t>(x));
|
||||
}
|
||||
|
||||
// Helper to get signed integer type corresponding to Torus type at compile time
|
||||
template <typename Torus> struct signed_torus_type {
|
||||
using clean_t = std::remove_cv_t<Torus>;
|
||||
|
||||
// Compile time check:
|
||||
static_assert(std::is_same<clean_t, uint32_t>::value ||
|
||||
std::is_same<clean_t, uint64_t>::value ||
|
||||
std::is_same<clean_t, __uint128_t>::value,
|
||||
"Torus must be uint32_t, uint64_t, or __uint128_t");
|
||||
|
||||
// Type alias (only one will activate)
|
||||
using type = typename std::conditional<
|
||||
std::is_same<clean_t, uint32_t>::value, int32_t,
|
||||
typename std::conditional<std::is_same<clean_t, uint64_t>::value, int64_t,
|
||||
__int128_t // fallback: we're assuming
|
||||
// __uint128_t -> __int128_t
|
||||
>::type>::type;
|
||||
};
|
||||
template <typename Torus>
|
||||
using signed_torus_t = typename signed_torus_type<Torus>::type;
|
||||
|
||||
template <typename T>
|
||||
__device__ inline T init_decomposer_state(T input, uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
@@ -91,8 +112,25 @@ __device__ inline T init_decomposer_state(T input, uint32_t base_log,
|
||||
return res - (need_balance << rep_bit_count);
|
||||
}
|
||||
|
||||
template <typename T, uint32_t base_log, uint32_t level_count>
|
||||
__device__ inline T init_decomposer_state_2_2_params(T input) {
|
||||
constexpr T rep_bit_count = level_count * base_log;
|
||||
constexpr T non_rep_bit_count = sizeof(T) * 8 - rep_bit_count;
|
||||
T res = input >> (non_rep_bit_count - 1);
|
||||
T rounding_bit = res & (T)(1);
|
||||
res++;
|
||||
res >>= 1;
|
||||
constexpr T torus_max = scalar_max<T>();
|
||||
constexpr T mod_mask = torus_max >> non_rep_bit_count;
|
||||
res &= mod_mask;
|
||||
T shifted_random = rounding_bit << (rep_bit_count - 1);
|
||||
T need_balance =
|
||||
(((res - (T)(1)) | shifted_random) & res) >> (rep_bit_count - 1);
|
||||
return res - (need_balance << rep_bit_count);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void modulus_switch(T input, T &output,
|
||||
__device__ __forceinline__ void modulus_switch(const T input, T &output,
|
||||
uint32_t log_modulus) {
|
||||
constexpr uint32_t BITS = sizeof(T) * 8;
|
||||
output = input + (((T)1) << (BITS - log_modulus - 1));
|
||||
@@ -107,25 +145,139 @@ __device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void modulus_switch_inplace(Torus *array, int size,
|
||||
__global__ void modulus_switch_inplace(Torus *array, uint32_t size,
|
||||
uint32_t log_modulus) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid < size) {
|
||||
array[tid] = modulus_switch(array[tid], log_modulus);
|
||||
}
|
||||
}
|
||||
|
||||
// Applies the modulus switch on a single LWE
|
||||
template <typename Torus>
|
||||
__host__ void host_modulus_switch_inplace(cudaStream_t stream,
|
||||
uint32_t gpu_index, Torus *array,
|
||||
int size, uint32_t log_modulus) {
|
||||
uint32_t size, uint32_t log_modulus) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
int num_threads = 0, num_blocks = 0;
|
||||
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
|
||||
modulus_switch_inplace<Torus>
|
||||
<<<num_blocks, num_threads, 0, stream>>>(array, size, log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void modulus_switch(Torus *output, const Torus *input, uint32_t size,
|
||||
uint32_t log_modulus) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid < size) {
|
||||
output[tid] = modulus_switch(input[tid], log_modulus);
|
||||
}
|
||||
}
|
||||
// Applies the modulus switch on a single LWE
|
||||
template <typename Torus>
|
||||
__host__ void host_modulus_switch(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *output, const Torus *input,
|
||||
uint32_t size, uint32_t log_modulus) {
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
int num_threads = 0, num_blocks = 0;
|
||||
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
|
||||
|
||||
modulus_switch_inplace<Torus>
|
||||
<<<num_blocks, num_threads, 0, stream>>>(array, size, log_modulus);
|
||||
modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
|
||||
output, input, size, log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T round_error(T input, uint32_t log_modulus) {
|
||||
T rounded;
|
||||
constexpr uint32_t BITS = sizeof(T) * 8;
|
||||
modulus_switch<T>(input, rounded, log_modulus);
|
||||
rounded <<= (BITS - log_modulus);
|
||||
rounded -= input;
|
||||
return rounded;
|
||||
}
|
||||
|
||||
// This method is based on rust's
|
||||
// core_crypto::centered_binary_ms_body_correction_to_add()
|
||||
template <typename T>
|
||||
__device__ T centered_binary_modulus_switch_body_correction_to_add(
|
||||
const T *lwe, const uint32_t lwe_dimension, const uint32_t log_modulus) {
|
||||
T sum_half_mask_round_errors = 0;
|
||||
signed_torus_t<T> sum_halving_errors_doubled = 0;
|
||||
constexpr auto TWO = static_cast<signed_torus_t<T>>(2);
|
||||
|
||||
for (auto i = 0; i < lwe_dimension; i++) {
|
||||
auto error = round_error(lwe[i], log_modulus);
|
||||
auto signed_error = static_cast<signed_torus_t<T>>(error);
|
||||
auto half_error = signed_error / TWO;
|
||||
|
||||
// Dividing by 2 can add an error where |error| <= 1/2 in each run of the
|
||||
// loop. Combined, they can add up to more than 1 (in the mod 2^64 torus)
|
||||
// Thus we compute this combined error to reduce it to less than 1/2
|
||||
// half_error = half_error_theoretical + halving_error_doubled/2
|
||||
// where half_error_theoretical * 2 = signed_error
|
||||
auto halving_error_doubled = (half_error * TWO) - signed_error;
|
||||
|
||||
sum_half_mask_round_errors += static_cast<T>(half_error);
|
||||
sum_halving_errors_doubled += halving_error_doubled;
|
||||
}
|
||||
|
||||
auto sum_halving_errors = static_cast<T>(sum_halving_errors_doubled / TWO);
|
||||
|
||||
// sum(half_error_theoretical) = sum(half_error) -
|
||||
// sum(halving_error_doubled)/2
|
||||
sum_half_mask_round_errors -= sum_halving_errors;
|
||||
|
||||
constexpr uint32_t BITS = sizeof(T) * 8;
|
||||
auto half_case = static_cast<T>(1) << (BITS - log_modulus - 1);
|
||||
|
||||
// E(e_MMS) = - sum(mask_round_error / 2)
|
||||
// body_centered = body_input - E(e_MMS) - half_case
|
||||
// body_centered = body_input + sum(mask_round_error / 2) - half_case
|
||||
// body_correction_to_add = sum(mask_round_error / 2) - half_case
|
||||
return sum_half_mask_round_errors - half_case;
|
||||
}
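The halving-error bookkeeping in the loop above can be illustrated on plain signed integers: summing truncated halves err/2 drops up to one half per term, so the doubled residues are accumulated separately and folded back in at the end, leaving a combined error below one half. A small sketch of that bookkeeping:

```cpp
#include <cstdint>
#include <vector>

inline int64_t sum_of_halves_sketch(const std::vector<int64_t> &errors) {
  int64_t sum_halves = 0;
  int64_t sum_residues_doubled = 0;
  for (int64_t e : errors) {
    int64_t half = e / 2;                   // truncated half
    sum_residues_doubled += (half * 2) - e; // residue in {-1, 0, 1}
    sum_halves += half;
  }
  // exact sum(e/2) = sum(truncated halves) - sum(residues)/2, recovered here
  // up to a remaining error below one half.
  return sum_halves - sum_residues_doubled / 2;
}
```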
template <typename Torus>
|
||||
__global__ void centered_modulus_switch(Torus *output, const Torus *input,
|
||||
uint32_t lwe_dimension,
|
||||
uint32_t log_modulus) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (tid == lwe_dimension) {
|
||||
auto correction = centered_binary_modulus_switch_body_correction_to_add(
|
||||
input, lwe_dimension, log_modulus);
|
||||
|
||||
auto body = input[lwe_dimension];
|
||||
output[lwe_dimension] = modulus_switch(body + correction, log_modulus);
|
||||
} else {
|
||||
output[tid] = modulus_switch(input[tid], log_modulus);
|
||||
}
|
||||
}
|
||||
|
||||
// Applies the centered modulus switch on a single LWE
|
||||
template <typename Torus>
|
||||
__host__ void host_centered_modulus_switch_inplace(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *output, const Torus *input,
|
||||
uint32_t lwe_dimension, uint32_t log_modulus) {
|
||||
cuda_set_device(gpu_index);
|
||||
if (input == output)
|
||||
PANIC("Input and Output arrays should be different")
|
||||
|
||||
// Get device properties to check max threads per block
|
||||
cudaDeviceProp deviceProp;
|
||||
cudaGetDeviceProperties(&deviceProp, gpu_index);
|
||||
|
||||
// Check if lwe_dimension+1 exceeds maximum threads per block
|
||||
if (lwe_dimension + 1 > deviceProp.maxThreadsPerBlock)
|
||||
PANIC("lwe_dimension+1 exceeds maximum number of threads per block")
|
||||
|
||||
// We assume a single block suffices, since lwe_dimension + 1 was checked
// above against maxThreadsPerBlock
|
||||
int num_threads = lwe_dimension + 1, num_blocks = 1;
|
||||
centered_modulus_switch<<<num_blocks, num_threads, 0, stream>>>(
|
||||
output, input, lwe_dimension, log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -275,22 +427,21 @@ __global__ void __launch_bounds__(512)
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_improve_noise_modulus_switch(
|
||||
__host__ void host_drift_modulus_switch(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
|
||||
Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
|
||||
uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
|
||||
const double input_variance, const double r_sigma, const double bound,
|
||||
uint32_t log_modulus) {
|
||||
|
||||
if (lwe_size < 512) {
|
||||
PANIC("The lwe_size is less than 512, this is not supported\n");
|
||||
return;
|
||||
}
|
||||
PANIC_IF_FALSE(lwe_size >= 512,
|
||||
"The lwe_size (%d) is less than 512, this is not supported\n",
|
||||
lwe_size);
|
||||
PANIC_IF_FALSE(
|
||||
lwe_size <= 1024,
|
||||
"The lwe_size (%d) is greater than 1024, this is not supported\n",
|
||||
lwe_size);
|
||||
|
||||
if (lwe_size > 1024) {
|
||||
PANIC("The lwe_size is greater than 1024, this is not supported\n");
|
||||
return;
|
||||
}
|
||||
cuda_set_device(gpu_index);
|
||||
|
||||
// This reduction requires a power of two num of threads
|
||||
|
||||
@@ -196,14 +196,14 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
    return;
  cudaPointerAttributes attr_dest;
  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
  if (attr_dest.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
  }
  PANIC_IF_FALSE(
      attr_dest.type == cudaMemoryTypeDevice,
      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
  cudaPointerAttributes attr_src;
  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
  if (attr_src.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
  PANIC_IF_FALSE(
      attr_src.type == cudaMemoryTypeDevice,
      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
  cuda_set_device(gpu_index);
  if (attr_src.device == attr_dest.device) {
    check_cuda_error(

@@ -227,14 +227,14 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
    return;
  cudaPointerAttributes attr_dest;
  check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
  if (attr_dest.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
  }
  PANIC_IF_FALSE(
      attr_dest.type == cudaMemoryTypeDevice,
      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
  cudaPointerAttributes attr_src;
  check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
  if (attr_src.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
  PANIC_IF_FALSE(
      attr_src.type == cudaMemoryTypeDevice,
      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
  cuda_set_device(gpu_index);
  if (attr_src.device == attr_dest.device) {
    check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
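Both hunks validate their arguments with cudaPointerGetAttributes before copying. A standalone illustration of that check, not taken from the backend, is:

#include <cuda_runtime.h>

// Returns true when ptr refers to device memory; illustration only.
static bool is_device_pointer(const void *ptr) {
  cudaPointerAttributes attr{};
  if (cudaPointerGetAttributes(&attr, ptr) != cudaSuccess)
    return false;
  return attr.type == cudaMemoryTypeDevice;
}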
@@ -63,7 +63,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
  }

  Index twiddle_shift = 1;
  for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
  for (Index l = LOG2_DEGREE - 1; l >= 5; --l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    twiddle_shift <<= 1;
@@ -96,8 +96,43 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
      tid = tid + STRIDE;
    }
  }
  __syncthreads();

  for (Index l = 4; l >= 1; --l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    twiddle_shift <<= 1;

    tid = threadIdx.x;
    __syncwarp();
    double2 reg_A[BUTTERFLY_DEPTH];
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      reg_A[i] = (u_stays_in_register) ? v[i] : u[i];
      tid = tid + STRIDE;
    }
    __syncwarp();

    tid = threadIdx.x;
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      w = shfl_xor_double2(reg_A[i], 1 << (l - 1), 0xFFFFFFFF);
      u[i] = (u_stays_in_register) ? u[i] : w;
      v[i] = (u_stays_in_register) ? w : v[i];
      w = negtwiddles[tid / lane_mask + twiddle_shift];

      w *= v[i];

      v[i] = u[i] - w;
      u[i] = u[i] + w;
      tid = tid + STRIDE;
    }
  }

  __syncthreads();
  // store registers in SM
  tid = threadIdx.x;
#pragma unroll
@@ -109,6 +144,119 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
  __syncthreads();
}
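The new last-four butterfly levels exchange coefficients between lanes of a warp instead of going through shared memory. shfl_xor_double2 is assumed to be a thin wrapper over the CUDA warp shuffle intrinsic; a plausible form, not necessarily the backend's exact implementation, is:

// Sketch (assumption): exchange a double2 between lanes whose ids differ in
// the bits of lane_mask, using the warp shuffle intrinsic.
__device__ inline double2 shfl_xor_double2_sketch(double2 val, int lane_mask,
                                                  unsigned mask) {
  double2 out;
  out.x = __shfl_xor_sync(mask, val.x, lane_mask);
  out.y = __shfl_xor_sync(mask, val.y, lane_mask);
  return out;
}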
/*
 * negacyclic fft optimized for 2_2 params
 * it uses the twiddles from shared memory for extra performance
 * this is possible because we know that for 2_2 params enough shared memory
 * is available
 * the fft is returned in registers to avoid extra synchronizations
 */
template <class params>
__device__ void NSMFFT_direct_2_2_params(double2 *A, double2 *fft_out,
                                         double2 *shared_twiddles) {

  /* We don't make bit reverse here, since twiddles are already reversed
   * Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides
   * The pragma unroll instruction tells the compiler to unroll the
   * full loop, which should increase performance
   */

  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
  constexpr Index LOG2_DEGREE = params::log2_degree;
  constexpr Index HALF_DEGREE = params::degree >> 1;
  constexpr Index STRIDE = params::degree / params::opt;

  Index tid = threadIdx.x;
  double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;

  // switch register order
  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
    u[i] = fft_out[i];
    v[i] = fft_out[i + params::opt / 2];
  }

  // level 1
  // we don't make actual complex multiplication on level1 since we have only
  // one twiddle, it's real and image parts are equal, so we can multiply
  // it with simpler operations
#pragma unroll
  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
    w = v[i] * (double2){0.707106781186547461715008466854,
                         0.707106781186547461715008466854};
    v[i] = u[i] - w;
    u[i] = u[i] + w;
  }

  Index twiddle_shift = 1;
  for (Index l = LOG2_DEGREE - 1; l >= 5; --l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    twiddle_shift <<= 1;

    tid = threadIdx.x;
    __syncthreads();
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      A[tid] = (u_stays_in_register) ? v[i] : u[i];
      tid = tid + STRIDE;
    }
    __syncthreads();

    tid = threadIdx.x;
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      w = A[tid ^ lane_mask];
      u[i] = (u_stays_in_register) ? u[i] : w;
      v[i] = (u_stays_in_register) ? w : v[i];
      w = shared_twiddles[tid / lane_mask + twiddle_shift];

      w *= v[i];

      v[i] = u[i] - w;
      u[i] = u[i] + w;
      tid = tid + STRIDE;
    }
  }

  for (Index l = 4; l >= 1; --l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    twiddle_shift <<= 1;

    tid = threadIdx.x;
    double2 reg_A[BUTTERFLY_DEPTH];

    tid = threadIdx.x;
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      reg_A[i] = (u_stays_in_register) ? v[i] : u[i];
      w = shfl_xor_double2(reg_A[i], 1 << (l - 1), 0xFFFFFFFF);
      u[i] = (u_stays_in_register) ? u[i] : w;
      v[i] = (u_stays_in_register) ? w : v[i];
      w = shared_twiddles[tid / lane_mask + twiddle_shift];

      w *= v[i];

      v[i] = u[i] - w;
      u[i] = u[i] + w;
      tid = tid + STRIDE;
    }
  }

  // Return result in registers, no need to synchronize here
  // (only needed if the same shared memory is reused afterwards)
  for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
    fft_out[i] = u[i];
    fft_out[i + params::opt / 2] = v[i];
  }
}
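The routine above reads its twiddles from shared memory; the caller is expected to have staged them there beforehand. A hypothetical staging step is sketched below; the buffer names and the cooperative copy loop are illustrative assumptions, not the backend's real caller code.

// Hypothetical sketch: copy the global twiddle table into shared memory once
// per block before running the register-based FFT.
template <class params>
__device__ void stage_twiddles_to_shared_sketch(double2 *shared_twiddles) {
  for (Index i = threadIdx.x; i < params::degree; i += blockDim.x)
    shared_twiddles[i] = negtwiddles[i];
  __syncthreads();
}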
/*
 * negacyclic inverse fft
 */
@@ -144,7 +292,46 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  }

  Index twiddle_shift = DEGREE;
  for (Index l = 1; l <= LOG2_DEGREE - 1; ++l) {
  for (Index l = 1; l <= 4; ++l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    tid = threadIdx.x;
    twiddle_shift >>= 1;

    // at this point registers are ready for the butterfly
    tid = threadIdx.x;
    __syncwarp();
    double2 reg_A[BUTTERFLY_DEPTH];
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
      w = (u[i] - v[i]);
      u[i] += v[i];
      v[i] = w * conjugate(negtwiddles[tid / lane_mask + twiddle_shift]);

      // keep one of the register for next iteration and store another one in sm
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      reg_A[i] = (u_stays_in_register) ? v[i] : u[i];

      tid = tid + STRIDE;
    }
    __syncwarp();

    // prepare registers for next butterfly iteration
    tid = threadIdx.x;
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      w = shfl_xor_double2(reg_A[i], 1 << (l - 1), 0xFFFFFFFF);
      u[i] = (u_stays_in_register) ? u[i] : w;
      v[i] = (u_stays_in_register) ? w : v[i];

      tid = tid + STRIDE;
    }
  }

  for (Index l = 5; l <= LOG2_DEGREE - 1; ++l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    tid = threadIdx.x;
@@ -201,6 +388,126 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  __syncthreads();
}
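The inverse butterflies multiply by the conjugated twiddle. For reference, conjugate is assumed to be the usual complex conjugate on double2, with the real part in .x and the imaginary part in .y; a minimal sketch:

// Sketch (assumption): complex conjugate of a double2 value.
__device__ inline double2 conjugate_sketch(double2 z) {
  return make_double2(z.x, -z.y);
}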
/*
 * negacyclic inverse fft optimized for 2_2 params
 * it uses the twiddles from shared memory for extra performance
 * this is possible because we know that for 2_2 params enough shared memory
 * is available
 * the input comes from registers to avoid some synchronizations and shared mem
 * usage
 */
template <class params>
__device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
                                          double2 *shared_twiddles) {

  /* We don't make bit reverse here, since twiddles are already reversed
   * Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides
   * The pragma unroll instruction tells the compiler to unroll the
   * full loop, which should increase performance
   */

  constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
  constexpr Index LOG2_DEGREE = params::log2_degree;
  constexpr Index DEGREE = params::degree;
  constexpr Index HALF_DEGREE = params::degree >> 1;
  constexpr Index STRIDE = params::degree / params::opt;

  size_t tid = threadIdx.x;
  double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;

  // load into registers and divide by compressed polynomial size
  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
    u[i] = buffer_regs[i];
    v[i] = buffer_regs[i + params::opt / 2];

    u[i] /= DEGREE;
    v[i] /= DEGREE;
  }

  Index twiddle_shift = DEGREE;
  for (Index l = 1; l <= 4; ++l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    tid = threadIdx.x;
    twiddle_shift >>= 1;

    // at this point registers are ready for the butterfly
    tid = threadIdx.x;
    double2 reg_A[BUTTERFLY_DEPTH];
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
      w = (u[i] - v[i]);
      u[i] += v[i];
      v[i] = w * conjugate(shared_twiddles[tid / lane_mask + twiddle_shift]);

      tid = tid + STRIDE;
    }
    __syncwarp();

    // prepare registers for next butterfly iteration
    tid = threadIdx.x;
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      reg_A[i] = (u_stays_in_register) ? v[i] : u[i];
      w = shfl_xor_double2(reg_A[i], 1 << (l - 1), 0xFFFFFFFF);
      u[i] = (u_stays_in_register) ? u[i] : w;
      v[i] = (u_stays_in_register) ? w : v[i];

      tid = tid + STRIDE;
    }
  }

  for (Index l = 5; l <= LOG2_DEGREE - 1; ++l) {
    Index lane_mask = 1 << (l - 1);
    Index thread_mask = (1 << l) - 1;
    tid = threadIdx.x;
    twiddle_shift >>= 1;

    // at this point registers are ready for the butterfly
    tid = threadIdx.x;
    __syncthreads();
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
      w = (u[i] - v[i]);
      u[i] += v[i];
      v[i] = w * conjugate(shared_twiddles[tid / lane_mask + twiddle_shift]);

      // keep one of the register for next iteration and store another one in sm
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      A[tid] = (u_stays_in_register) ? v[i] : u[i];

      tid = tid + STRIDE;
    }
    __syncthreads();

    // prepare registers for next butterfly iteration
    tid = threadIdx.x;
#pragma unroll
    for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
      Index rank = tid & thread_mask;
      bool u_stays_in_register = rank < lane_mask;
      w = A[tid ^ lane_mask];
      u[i] = (u_stays_in_register) ? u[i] : w;
      v[i] = (u_stays_in_register) ? w : v[i];

      tid = tid + STRIDE;
    }
  }

  // last iteration
#pragma unroll
  for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
    w = (u[i] - v[i]);
    buffer_regs[i] = u[i] + v[i];
    buffer_regs[i + params::opt / 2] =
        w * (double2){0.707106781186547461715008466854,
                      -0.707106781186547461715008466854};
  }
}
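For reference (not part of the diff), the hard-coded constant 0.70710678... used in the first and last butterfly levels is 1/sqrt(2): the level-1 twiddle has real and imaginary parts of equal magnitude, so the complex multiplication reduces to additions plus a single scaling:

w = e^{i\pi/4} = \tfrac{1}{\sqrt{2}}(1 + i), \qquad
w \cdot (a + i b) = \tfrac{1}{\sqrt{2}}\bigl((a - b) + i\,(a + b)\bigr)

The inverse pass uses the conjugate \tfrac{1}{\sqrt{2}}(1 - i), which is why the second component carries a negative sign in the "last iteration" block above.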
/*
 * global batch fft
 * does fft in half size

@@ -7,12 +7,13 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
    PBS_TYPE pbs_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus, allocate_ms_array);
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_abs_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,

@@ -40,4 +41,6 @@ void cleanup_cuda_integer_abs_inplace(void *const *streams,
  int_abs_buffer<uint64_t> *mem_ptr =
      (int_abs_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
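The cleanup entry points touched by this diff all follow the same pattern: cast the opaque handle back to its buffer type, release the device allocations on the given streams, delete the host object, and null the caller's pointer. A generic sketch of that pattern is shown below; the backend spells it out per buffer type, and the template here is only illustrative.

// Illustrative sketch, not backend code: the shared cleanup pattern.
template <typename BufferT>
void cleanup_buffer_sketch(void *const *streams, uint32_t const *gpu_indexes,
                           uint32_t gpu_count, int8_t **mem_ptr_void) {
  BufferT *mem_ptr = (BufferT *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  delete mem_ptr;
  // Clearing the handle turns an accidental double cleanup into a visible
  // null-pointer error instead of a use-after-free.
  *mem_ptr_void = nullptr;
}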
@@ -7,12 +7,13 @@ uint64_t scratch_cuda_integer_radix_bitop_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    BITOP_TYPE op_type, bool allocate_gpu_memory, bool allocate_ms_array) {
    BITOP_TYPE op_type, bool allocate_gpu_memory,
    PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus, allocate_ms_array);
                          message_modulus, carry_modulus, noise_reduction_type);

  return scratch_cuda_integer_radix_bitop_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,

@@ -41,6 +42,8 @@ void cleanup_cuda_integer_bitop(void *const *streams,
  int_bitop_buffer<uint64_t> *mem_ptr =
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}

void update_degrees_after_bitand(uint64_t *output_degrees,

@@ -20,12 +20,15 @@ __host__ void host_integer_radix_bitop_kb(
    void *const *bsks, Torus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  if (lwe_array_out->num_radix_blocks != lwe_array_1->num_radix_blocks ||
      lwe_array_out->num_radix_blocks != lwe_array_2->num_radix_blocks)
    PANIC("Cuda error: input and output num radix blocks must be equal")
  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
      lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
    PANIC("Cuda error: input and output lwe dimension must be equal")
  PANIC_IF_FALSE(
      lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
      lwe_array_out->num_radix_blocks == lwe_array_2->num_radix_blocks,
      "Cuda error: input and output num radix blocks must be equal");

  PANIC_IF_FALSE(lwe_array_out->lwe_dimension == lwe_array_1->lwe_dimension &&
                 lwe_array_out->lwe_dimension == lwe_array_2->lwe_dimension,
                 "Cuda error: input and output lwe dimension must be equal");

  auto lut = mem_ptr->lut;
  uint64_t degrees[lwe_array_1->num_radix_blocks];
  if (mem_ptr->op == BITOP_TYPE::BITAND) {
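The degrees array above is filled by helpers such as update_degrees_after_bitand, whose signature appears in the previous hunk. As a hedged illustration of the idea (the exact rule is defined elsewhere in the backend and may be different), an encrypted bitwise AND cannot raise a block's degree above the smaller operand degree:

#include <algorithm>
#include <cstdint>

// Hypothetical sketch: bound each output degree by the smaller operand degree.
void update_degrees_after_bitand_sketch(uint64_t *output_degrees,
                                        const uint64_t *lhs_degrees,
                                        const uint64_t *rhs_degrees,
                                        uint32_t num_radix_blocks) {
  for (uint32_t i = 0; i < num_radix_blocks; ++i)
    output_degrees[i] = std::min(lhs_degrees[i], rhs_degrees[i]);
}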
@@ -23,13 +23,13 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_blocks, uint32_t num_additional_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, bool allocate_ms_array) {
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus,
                          allocate_ms_array);
                          noise_reduction_type);

  return scratch_extend_radix_with_sign_msb<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count,

@@ -43,20 +43,24 @@ void cuda_extend_radix_with_sign_msb_64(
    int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  PUSH_RANGE("cast")
  host_extend_radix_with_sign_msb<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
      num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
  POP_RANGE()
}

void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
                                                uint32_t const *gpu_indexes,
                                                uint32_t gpu_count,
                                                int8_t **mem_ptr_void) {

  PUSH_RANGE("clean cast")
  int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
      (int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);

  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  POP_RANGE()
  delete mem_ptr;
  *mem_ptr_void = nullptr;
}
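PUSH_RANGE/POP_RANGE bracket these entry points so they show up as named ranges in profiler timelines. They are assumed to wrap NVTX; a typical minimal definition, not necessarily the backend's own, is:

// Sketch (assumption): NVTX-based range markers for Nsight Systems traces.
#include <nvtx3/nvToolsExt.h>

#define PUSH_RANGE_SKETCH(name) nvtxRangePushA(name)
#define POP_RANGE_SKETCH() nvtxRangePop()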
@@ -9,9 +9,11 @@ template <typename Torus>
__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
    cudaStream_t const *streams, uint32_t const *gpu_indexes) {
  PUSH_RANGE("extend only")
  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
                                           0, input->num_radix_blocks, input, 0,
                                           input->num_radix_blocks);
  POP_RANGE()
}

template <typename Torus>
@@ -23,10 +25,10 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
  const uint32_t input_start_lwe_index =
      input->num_radix_blocks - output->num_radix_blocks;

  if (input->num_radix_blocks <= output->num_radix_blocks) {
    PANIC("Cuda error: input num blocks should be greater than output num "
          "blocks");
  }
  PANIC_IF_FALSE(input->num_radix_blocks > output->num_radix_blocks,
                 "Cuda error: input num blocks (%d) should be greater than "
                 "output num blocks (%d)",
                 input->num_radix_blocks, output->num_radix_blocks);

  copy_radix_ciphertext_slice_async<Torus>(
      streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,

@@ -39,13 +41,13 @@ __host__ uint64_t scratch_extend_radix_with_sign_msb(
    uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
    const int_radix_params params, uint32_t num_radix_blocks,
    uint32_t num_additional_blocks, const bool allocate_gpu_memory) {

  PUSH_RANGE("scratch cast/extend")
  uint64_t size_tracker = 0;

  *mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
      streams, gpu_indexes, gpu_count, params, num_radix_blocks,
      num_additional_blocks, allocate_gpu_memory, size_tracker);

  POP_RANGE()
  return size_tracker;
}

@@ -59,16 +61,16 @@ __host__ void host_extend_radix_with_sign_msb(
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  if (num_additional_blocks == 0) {
    PUSH_RANGE("cast/extend no addblocks")
    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
                                       input);
    POP_RANGE()
    return;
  }

  PUSH_RANGE("cast/extend")
  const uint32_t input_blocks = input->num_radix_blocks;

  if (input_blocks == 0) {
    PANIC("Cuda error: input blocks cannot be zero");
  }
  PANIC_IF_FALSE(input_blocks > 0, "Cuda error: input blocks cannot be zero");

  copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
                                           0, input_blocks, input, 0,
@@ -89,6 +91,7 @@ __host__ void host_extend_radix_with_sign_msb(
                                             dst_block_idx, dst_block_idx + 1,
                                             mem_ptr->padding_block, 0, 1);
  }
  POP_RANGE()
}

#endif

@@ -7,21 +7,22 @@ uint64_t scratch_cuda_integer_radix_cmux_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, bool allocate_ms_array) {
    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
  PUSH_RANGE("scratch cmux")
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, small_lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus, allocate_ms_array);
                          message_modulus, carry_modulus, noise_reduction_type);

  std::function<uint64_t(uint64_t)> predicate_lut_f =
      [](uint64_t x) -> uint64_t { return x == 1; };

  return scratch_cuda_integer_radix_cmux_kb<uint64_t>(
  uint64_t ret = scratch_cuda_integer_radix_cmux_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
      lwe_ciphertext_count, params, allocate_gpu_memory);
  POP_RANGE()
  return ret;
}

void cuda_cmux_integer_radix_ciphertext_kb_64(
@@ -49,5 +50,7 @@ void cleanup_cuda_integer_radix_cmux(void *const *streams,
  int_cmux_buffer<uint64_t> *mem_ptr =
      (int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  delete mem_ptr;
  *mem_ptr_void = nullptr;
  POP_RANGE()
}

Some files were not shown because too many files have changed in this diff.