chore(gpu): replace cudaStreamCaptureModeGlobal with cudaStreamCaptureModeThreadLocal to avoid CPU multi-threading issues

feat(gpu): implement CUDA Graph to accelerate default classical and
multibit PBS
2026-04-28 03:01:21 -04:00 · 2025-01-06 18:01:21 -03:00 · 2025-01-06 17:29:48 -03:00 · 2025-01-06 18:17:27 +01:00 · 2025-01-06 16:54:40 +01:00 · 2025-01-06 13:09:43 +01:00
358 changed files with 14879 additions and 4152 deletions
--- a/.github/actions/hyperstack_setup/action.yml
+++ b/.github/actions/hyperstack_setup/action.yml
@@ -0,0 +1,69 @@
+# Composite action preparing a Hyperstack GPU runner: installs build
+# dependencies and CMake, then exports CUDA and host-compiler variables.
+name: Setup Cuda
+description: Setup Cuda on Hyperstack instance
+
+inputs:
+  cuda-version:
+    description: Version of Cuda to use
+    required: true
+  gcc-version:
+    description: Version of GCC to use
+    required: true
+  cmake-version:
+    description: Version of cmake to use
+    # Quoted so the version is always read as a string.
+    default: "3.29.6"
+
+runs:
+  using: "composite"
+  steps:
+    # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+    # Installs build packages, then builds and installs CMake from source.
+    - name: Install dependencies
+      shell: bash
+      env:
+        # Inputs are passed through the environment rather than being
+        # interpolated into the script text.
+        CMAKE_VERSION: ${{ inputs.cmake-version }}
+      run: |
+        sudo apt update
+        sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+        wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz"
+        tar -zxvf "cmake-${CMAKE_VERSION}.tar.gz"
+        cd "cmake-${CMAKE_VERSION}"
+        ./bootstrap
+        make -j"$(nproc)"
+        sudo make install
+
+    # Publishes the CUDA toolkit location to later steps of the calling job.
+    - name: Export CUDA variables
+      shell: bash
+      env:
+        CUDA_VERSION: ${{ inputs.cuda-version }}
+      run: |
+        CUDA_PATH="/usr/local/cuda-${CUDA_VERSION}"
+        echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+        echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+        # Append the previous value only when it is non-empty: a trailing ':'
+        # would add an empty entry (the current directory) to the search path.
+        echo "LD_LIBRARY_PATH=$CUDA_PATH/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "${GITHUB_ENV}"
+        echo "CUDACXX=$CUDA_PATH/bin/nvcc" >> "${GITHUB_ENV}"
+
+    # Specify the correct host compilers
+    - name: Export gcc and g++ variables
+      shell: bash
+      env:
+        GCC_VERSION: ${{ inputs.gcc-version }}
+      run: |
+        {
+          echo "CC=/usr/bin/gcc-${GCC_VERSION}";
+          echo "CXX=/usr/bin/g++-${GCC_VERSION}";
+          echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
+          echo "HOME=/home/ubuntu";
+        } >> "${GITHUB_ENV}"
+
+    # Fails the job early if nvidia-smi exits with an error.
+    - name: Check device is detected
+      shell: bash
+      run: nvidia-smi
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -100,7 +100,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -11,16 +11,26 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+  pull_request_target:

 jobs:
+  check-user-permission:
+    if: github.event_name == 'pull_request_target'
+    uses: ./.github/workflows/check_triggering_actor.yml
+    secrets:
+      TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
  should-run:
    runs-on: ubuntu-latest
+    needs: check-user-permission
+    if: github.event_name != 'pull_request_target' ||
+      needs.check-user-permission.result == 'success'
    permissions:
      pull-requests: write
    outputs:
@@ -55,10 +65,11 @@ jobs:
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          ref: ${{ github.event.pull_request.head.sha }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -133,7 +144,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -148,7 +159,7 @@ jobs:
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    needs: [ should-run, setup-instance ]
    concurrency:
-      group: ${{ github.workflow }}_${{ github.ref }}
+      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -157,9 +168,10 @@ jobs:
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          ref: ${{ github.event.pull_request.head.sha }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -270,7 +282,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -75,7 +75,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -140,7 +140,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -75,7 +75,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -144,7 +144,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -67,7 +67,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -142,7 +142,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -250,7 +250,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -119,7 +119,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -93,7 +93,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}
@@ -127,7 +127,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -58,7 +58,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -81,7 +81,7 @@ jobs:
          --walk-subdirs

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -115,7 +115,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -97,7 +97,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_erc20
          path: ${{ env.RESULTS_FILENAME }}
@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -54,7 +54,7 @@ jobs:
          echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -82,7 +82,7 @@ jobs:
          --walk-subdirs

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_integer_multi_bit_gpu_default
          path: ${{ env.RESULTS_FILENAME }}
@@ -127,7 +127,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -157,7 +157,7 @@ jobs:
      

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -48,28 +48,19 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Get benchmark details
        run: |
          {
@@ -84,31 +75,10 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
      - name: Run benchmarks with AVX512
        run: |
          make bench_pbs_gpu
@@ -128,7 +98,7 @@ jobs:
          --walk-subdirs

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -167,7 +137,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_erc20.yml
+++ b/.github/workflows/benchmark_gpu_erc20.yml
@@ -12,7 +12,10 @@ on:
          - "l40 (n3-L40x1)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
+          - "4-h100 (n3-H100x4)"
          - "multi-h100 (n3-H100x8)"
+          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-h100-sxm5 (n3-H100x8-SXM5)"

 jobs:
  parse-inputs:
--- a/.github/workflows/benchmark_gpu_erc20_common.yml
+++ b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -54,7 +54,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -75,28 +75,19 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Get benchmark details
        run: |
          {
@@ -111,34 +102,10 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run benchmarks
        run: |
          make bench_hlapi_erc20_gpu
@@ -157,9 +124,9 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
-          name: ${{ github.sha }}_erc20
+          name: ${{ github.sha }}_erc20_${{ inputs.profile }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
@@ -196,7 +163,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer.yml
+++ b/.github/workflows/benchmark_gpu_integer.yml
@@ -15,6 +15,7 @@ on:
          - "4-h100 (n3-H100x4)"
          - "multi-h100 (n3-H100x8)"
          - "multi-h100-nvlink (n3-H100x8-NVLink)"
+          - "multi-h100-sxm5 (n3-H100x8-SXM5)"
          - "multi-a100-nvlink (n3-A100x8-NVLink)"
      command:
        description: "Benchmark command to run"
--- a/.github/workflows/benchmark_gpu_integer_common.yml
+++ b/.github/workflows/benchmark_gpu_integer_common.yml
@@ -118,7 +118,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -145,28 +145,19 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Get benchmark details
        run: |
          {
@@ -181,41 +172,10 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-          } >> "${GITHUB_ENV}"
-
-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Should run benchmarks with all precisions
        if: inputs.all_precisions
        run: |
@@ -240,11 +200,18 @@ jobs:
          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}
          path: ${{ env.RESULTS_FILENAME }}

+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
      - name: Send data to Slab
        shell: bash
        run: |
@@ -272,7 +239,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_integer.yml
+++ b/.github/workflows/benchmark_integer.yml
@@ -90,7 +90,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -170,7 +170,7 @@ jobs:
          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -197,7 +197,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -56,7 +56,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -136,7 +136,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -163,7 +163,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_signed_integer.yml
+++ b/.github/workflows/benchmark_signed_integer.yml
@@ -90,7 +90,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -164,7 +164,7 @@ jobs:
          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -191,7 +191,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -32,7 +32,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,7 +84,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_fft
          path: ${{ env.RESULTS_FILENAME }}
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -32,7 +32,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,7 +84,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_ntt
          path: ${{ env.RESULTS_FILENAME }}
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_zk_pok.yml
+++ b/.github/workflows/benchmark_tfhe_zk_pok.yml
@@ -36,7 +36,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -58,7 +58,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,7 +91,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -121,7 +121,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_tfhe_zk_pok
          path: ${{ env.RESULTS_FILENAME }}
@@ -155,7 +155,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -40,7 +40,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -166,7 +166,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -200,7 +200,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -104,7 +104,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -141,7 +141,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: nightly

@@ -177,7 +177,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
+        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
        with:
          name: ${{ github.sha }}_integer_zk
          path: ${{ env.RESULTS_FILENAME }}
@@ -211,7 +211,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -28,7 +28,7 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

--- a/.github/workflows/check_triggering_actor.yml
+++ b/.github/workflows/check_triggering_actor.yml
@@ -0,0 +1,35 @@
+# Reusable workflow (workflow_call): fails unless the triggering actor has write access.
+name: Check Triggering Actor
+
+on:
+  workflow_call:
+    secrets:
+      TOKEN:
+        required: true
+
+jobs:
+  check-actor-permission:
+    runs-on: ubuntu-latest
+    steps:
+      # Checks the triggering actor against the required `write` level, using the caller-supplied TOKEN secret.
+      - name: Get User Permission
+        id: check-access
+        uses: actions-cool/check-user-permission@956b2e73cdfe3bcb819bb7225e490cb3b18fd76e # v2.2.1
+        with:
+          require: write
+          username: ${{ github.triggering_actor }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.TOKEN }}
+
+      # Context values are handed to the script through env instead of being expanded into the shell text, so they are never parsed as shell code.
+      - name: Check User Permission
+        if: steps.check-access.outputs.require-result == 'false'
+        env:
+          TRIGGERING_ACTOR: ${{ github.triggering_actor }}
+          USER_PERMISSION: ${{ steps.check-access.outputs.user-permission }}
+          ORIGINAL_ACTOR: ${{ github.actor }}
+        run: |
+          echo "${TRIGGERING_ACTOR} does not have permissions on this repo."
+          echo "Current permission level is ${USER_PERMISSION}"
+          echo "Job originally triggered by ${ORIGINAL_ACTOR}"
+          exit 1
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -27,7 +27,7 @@ jobs:
          make lint_workflow

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@5d6ac37a4cef8b8df67f482a8e384987766f0213 # v3.0.17
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@6ae615f6475d2ede5ad88bea6baa7a1d5e93ffaa # v3.0.19
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,13 +47,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          files_yaml: |
            tfhe:
@@ -83,7 +83,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@7f8b4b4bde536c465e797be725718b88c5d95e0e
+        uses: codecov/codecov-action@1e68e06f1dbfde0e4cefc87efeba9e4643565303
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@7f8b4b4bde536c465e797be725718b88c5d95e0e
+        uses: codecov/codecov-action@1e68e06f1dbfde0e4cefc87efeba9e4643565303
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -121,7 +121,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -75,7 +75,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -40,7 +40,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,60 +94,28 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run core crypto and internal CUDA backend tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
@@ -187,7 +155,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -34,7 +34,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -66,7 +66,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -92,60 +92,28 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run core crypto and internal CUDA backend tests
        run: |
          make test_core_crypto_gpu
@@ -185,7 +153,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,9 +49,6 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
@@ -71,38 +68,21 @@ jobs:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run core crypto, integer and internal CUDA backend tests
        run: |
          make test_gpu
@@ -139,7 +119,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,60 +94,28 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run multi-bit CUDA integer compression tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
@@ -190,7 +158,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -1,4 +1,4 @@
-name: AWS Long Run Tests on GPU
+name: Long Run Tests on GPU

 env:
  CARGO_TERM_COLOR: always
@@ -15,8 +15,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Weekly tests will be triggered each Friday at 1a.m.
-    - cron: '0 1 * * FRI'
+    # Weekly tests will be triggered each Friday at 9p.m.
+    - cron: "0 21 * * 5"

 jobs:
  setup-instance:
@@ -29,17 +29,17 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: hyperstack
-          profile: 2-h100
+          profile: multi-gpu-test

  cuda-tests:
-    name: Long run GPU H100 tests
+    name: Long run GPU tests
    needs: [ setup-instance ]
    concurrency:
      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
@@ -53,57 +53,26 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
+    timeout-minutes: 4320 # 72 hours
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run tests
        run: |
          make test_integer_long_run_gpu
@@ -119,7 +88,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests.result }}
-          SLACK_MESSAGE: "Integer GPU H100 long run tests finished with status: ${{ needs.cuda-tests.result }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer GPU long run tests finished with status: ${{ needs.cuda-tests.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (gpu-tests)
@@ -129,7 +98,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -24,7 +24,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -110,7 +110,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,58 +94,25 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run signed integer tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
@@ -171,7 +138,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,58 +94,25 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run signed integer multi-bit tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
@@ -171,7 +138,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -42,7 +42,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -75,7 +75,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -101,57 +101,28 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
      - name: Should run nightly tests
        if: github.event_name == 'schedule'
        run: |
@@ -160,10 +131,6 @@ jobs:
            echo "NIGHTLY_TESTS=TRUE";
          } >> "${GITHUB_ENV}"

-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run signed integer multi-bit tests
        run: |
          make test_signed_integer_multi_bit_gpu_ci
@@ -189,7 +156,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,58 +94,25 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run unsigned integer tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
@@ -171,7 +138,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,58 +94,25 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run unsigned integer multi-bit tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
@@ -171,7 +138,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@bab30c2299617f6615ec02a68b9a40d10bd21366
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -74,7 +74,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -100,54 +100,25 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
-      CMAKE_VERSION: 3.29.6
    steps:
-      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
-          cd cmake-${{ env.CMAKE_VERSION }}
-          ./bootstrap
-          make -j"$(nproc)"
-          sudo make install
-
-
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

+      - name: Setup Hyperstack dependencies
+        uses: ./.github/actions/hyperstack_setup
+        with:
+          cuda-version: ${{ matrix.cuda }}
+          gcc-version: ${{ matrix.gcc }}
+
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
      - name: Should run nightly tests
        if: github.event_name == 'schedule'
        run: |
@@ -156,10 +127,6 @@ jobs:
            echo "NIGHTLY_TESTS=TRUE";
          } >> "${GITHUB_ENV}"

-      - name: Check device is detected
-        if: ${{ !cancelled() }}
-        run: nvidia-smi
-
      - name: Run unsigned integer multi-bit tests
        run: |
          make test_unsigned_integer_multi_bit_gpu_ci
@@ -185,7 +152,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -15,8 +15,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Weekly tests will be triggered each Friday at 1a.m.
-    - cron: '0 1 * * FRI'
+    # Weekly tests will be triggered each Friday at 9p.m.
+    - cron: "0 21 * * 5"

 jobs:
  setup-instance:
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -45,6 +45,7 @@ jobs:
      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
+    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -53,7 +54,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -77,7 +78,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -39,7 +39,7 @@ jobs:
          persist-credentials: "false"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -50,7 +50,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe
-      - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
+      - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
        with:
          name: crate
          path: target/package/*.crate
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -36,7 +36,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
+        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
          toolchain: stable

@@ -120,7 +120,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
+        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,11 +19,14 @@ exclude = [
    "utils/cargo-tfhe-lints"
 ]
 [workspace.dependencies]
-aligned-vec = { version = "0.5", default-features = false }
+aligned-vec = { version = "0.6", default-features = false }
 bytemuck = "1.14.3"
-dyn-stack = { version = "0.10", default-features = false }
+dyn-stack = { version = "0.11", default-features = false }
+itertools = "0.13"
 num-complex = "0.4"
-pulp = { version = "0.18.22", default-features = false }
+pulp = { version = "0.20.0", default-features = false }
+rand = "0.8"
+rayon = "1"
 serde = { version = "1.0", default-features = false }
 wasm-bindgen = ">=0.2.86,<0.2.94"

--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,6 @@ SHELL:=$(shell /usr/bin/env which bash)
 OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
-TARGET_ARCH_FEATURE:=$(shell ./scripts/get_arch_feature.sh)
 CPU_COUNT=$(shell ./scripts/cpu_count.sh)
 RS_BUILD_TOOLCHAIN:=stable
 CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
@@ -25,6 +24,7 @@ BACKWARD_COMPAT_DATA_BRANCH?=v0.4
 BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
 BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
 TFHE_SPEC:=tfhe
+WASM_PACK_VERSION="0.13.1"
 # We are kind of hacking the cut here, the version cannot contain a quote '"'
 WASM_BINDGEN_VERSION:=$(shell grep '^wasm-bindgen[[:space:]]*=' Cargo.toml | cut -d '"' -f 2 | xargs)
 WEB_RUNNER_DIR=web-test-runner
@@ -116,8 +116,8 @@ install_wasm_bindgen_cli: install_rs_build_toolchain

 .PHONY: install_wasm_pack # Install wasm-pack to build JS packages
 install_wasm_pack: install_rs_build_toolchain
-	@wasm-pack --version > /dev/null 2>&1 || \
-	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked wasm-pack@0.13.0 || \
+	@wasm-pack --version 2>/dev/null | grep -qF "$(WASM_PACK_VERSION)" || \
+	cargo $(CARGO_RS_BUILD_TOOLCHAIN) install --locked wasm-pack@$(WASM_PACK_VERSION) || \
 	( echo "Unable to install cargo wasm-pack, unknown error." && exit 1 )

 .PHONY: install_node # Install last version of NodeJS via nvm
@@ -281,14 +281,14 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
+		--features=boolean,shortint,integer,internal-keycache,gpu \
 		--all-targets \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
+		--features=boolean,shortint,integer,internal-keycache,gpu \
 		--all-targets \
 		-p $(TFHE_SPEC)

@@ -307,52 +307,51 @@ lint_workflow: check_actionlint_installed
 .PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
 clippy_core: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE) \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),experimental \
+		--features=experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),nightly-avx512 \
+		--features=nightly-avx512 \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
+		--features=experimental,nightly-avx512 \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),zk-pok \
+		--features=zk-pok \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_boolean # Run clippy lints enabling the boolean features
 clippy_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),boolean \
+		--features=boolean \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_shortint # Run clippy lints enabling the shortint features
 clippy_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),shortint \
+		--features=shortint \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),shortint,experimental \
+		--features=shortint,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),zk-pok,shortint \
+		--features=zk-pok,shortint \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_integer # Run clippy lints enabling the integer features
 clippy_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),integer \
+		--features=integer \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),integer,experimental \
+		--features=integer,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy # Run clippy lints enabling the boolean, shortint, integer
 clippy: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer \
+		--features=boolean,shortint,integer \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_rustdoc # Run clippy lints on doctests enabling the boolean, shortint, integer and zk-pok
@@ -363,13 +362,13 @@ clippy_rustdoc: install_rs_check_toolchain
 	fi && \
 	CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats,strings \
+		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings \
 		-p $(TFHE_SPEC)

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
 clippy_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
@@ -394,17 +393,16 @@ clippy_trivium: install_rs_check_toolchain
 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
 clippy_tfhe_csprng: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=$(TARGET_ARCH_FEATURE) \
-		-p tfhe-csprng -- --no-deps -D warnings
+		--features=parallel,software-prng -p tfhe-csprng -- --no-deps -D warnings

 .PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
 clippy_zk_pok: install_rs_check_toolchain
@@ -443,67 +441,67 @@ check_rust_bindings_did_not_change:
 .PHONY: tfhe_lints # Run custom tfhe-rs lints
 tfhe_lints: install_tfhe_lints
 	cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok -- -D warnings
+		--features=boolean,shortint,integer,zk-pok -- -D warnings

 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE) -p $(TFHE_SPEC)
+		-p $(TFHE_SPEC)
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),nightly-avx512 -p $(TFHE_SPEC); \
+			--features=nightly-avx512 -p $(TFHE_SPEC); \
 	fi

 .PHONY: build_core_experimental # Build core_crypto with experimental features
 build_core_experimental: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC)
+		--features=experimental -p $(TFHE_SPEC)
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 -p $(TFHE_SPEC); \
+			--features=experimental,nightly-avx512 -p $(TFHE_SPEC); \
 	fi

 .PHONY: build_boolean # Build with boolean enabled
 build_boolean: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean -p $(TFHE_SPEC) --all-targets
+		--features=boolean -p $(TFHE_SPEC) --all-targets

 .PHONY: build_shortint # Build with shortint enabled
 build_shortint: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),shortint -p $(TFHE_SPEC) --all-targets
+		--features=shortint -p $(TFHE_SPEC) --all-targets

 .PHONY: build_integer # Build with integer enabled
 build_integer: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) --all-targets
+		--features=integer -p $(TFHE_SPEC) --all-targets

 .PHONY: build_tfhe_full # Build with boolean, shortint and integer enabled
 build_tfhe_full: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
+		--features=boolean,shortint,integer -p $(TFHE_SPEC) --all-targets

 .PHONY: build_tfhe_coverage # Build with test coverage enabled
 build_tfhe_coverage: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
+		--features=boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests

 .PHONY: build_c_api # Build the C API for boolean, shortint and integer
 build_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok \
 		-p $(TFHE_SPEC)

 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,gpu \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,gpu \
 		-p $(TFHE_SPEC)

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
 build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
 		-p $(TFHE_SPEC)

 .PHONY: build_web_js_api # Build the js API targeting the web browser
@@ -534,15 +532,15 @@ build_node_js_api: install_rs_build_toolchain install_wasm_pack
 .PHONY: build_tfhe_csprng # Build tfhe_csprng
 build_tfhe_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng --all-targets
+		-p tfhe-csprng --all-targets

 .PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
 test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok -p $(TFHE_SPEC) -- core_crypto::
+		--features=experimental,zk-pok -p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok,nightly-avx512 -p $(TFHE_SPEC) -- core_crypto::; \
+			--features=experimental,zk-pok,nightly-avx512 -p $(TFHE_SPEC) -- core_crypto::; \
 	fi

 .PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
@@ -550,13 +548,13 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
 		--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
+		--features=experimental,internal-keycache \
 		-p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 			--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
 			--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-			--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,nightly-avx512 \
+			--features=experimental,internal-keycache,nightly-avx512 \
 			-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
 	fi

@@ -574,35 +572,38 @@ test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=gpu -p $(TFHE_SPEC) -- core_crypto::gpu::

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
 test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6
+		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
+		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

-.PHONY: test_integer_long_run_gpu # Run the tests of the integer module including experimental on the gpu backend
-test_integer_long_run_gpu: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu,__long_run_tests -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run --test-threads=6
+.PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
+test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
+	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
+	LONG_TESTS=TRUE \
+		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
+		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
+		--tfhe-package "$(TFHE_SPEC)" --backend "gpu"

 .PHONY: test_integer_compression
 test_integer_compression: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) -- integer::ciphertext::compressed_ciphertext_list::tests::
+		--features=integer -p $(TFHE_SPEC) -- integer::ciphertext::compressed_ciphertext_list::tests::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) -- integer::ciphertext::compress
+		--features=integer -p $(TFHE_SPEC) -- integer::ciphertext::compress

 .PHONY: test_integer_compression_gpu
 test_integer_compression_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::
+		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
+		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress

 .PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
 test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
@@ -661,20 +662,20 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
 .PHONY: test_boolean # Run the tests of the boolean module
 test_boolean: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean -p $(TFHE_SPEC) -- boolean::
+		--features=boolean -p $(TFHE_SPEC) -- boolean::

 .PHONY: test_boolean_cov # Run the tests of the boolean module with code coverage
 test_boolean_cov: install_rs_check_toolchain install_tarpaulin
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache \
+		--features=boolean,internal-keycache \
 		-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::

 .PHONY: test_c_api_rs # Run the rust tests for the C API
 test_c_api_rs: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
+		--features=boolean-c-api,shortint-c-api,high-level-c-api \
 		-p $(TFHE_SPEC) \
 		c_api

@@ -706,14 +707,14 @@ test_shortint_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
 .PHONY: test_shortint # Run all the tests for shortint
 test_shortint: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p $(TFHE_SPEC) -- shortint::
+		--features=shortint,internal-keycache -p $(TFHE_SPEC) -- shortint::

 .PHONY: test_shortint_cov # Run the tests of the shortint module with code coverage
 test_shortint_cov: install_rs_check_toolchain install_tarpaulin
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
+		--features=shortint,internal-keycache \
 		-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::

 .PHONY: test_integer_ci # Run the tests for integer ci
@@ -770,26 +771,28 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"

-.PHONY: test_integer_long_run # Run the long run tests for integer
-test_integer_long_run: install_rs_build_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-						--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,__long_run_tests -p $(TFHE_SPEC) -- integer::server_key::radix_parallel::tests_long_run
-
+.PHONY: test_integer_long_run # Run the long run integer tests
+test_integer_long_run: install_rs_check_toolchain install_cargo_nextest
+	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
+	LONG_TESTS=TRUE \
+		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
+		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
+		--tfhe-package "$(TFHE_SPEC)"

 .PHONY: test_safe_serialization # Run the tests for safe serialization
 test_safe_serialization: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_serialization::
+		--features=boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_serialization::

 .PHONY: test_zk # Run the tests for the zk module of the TFHE-rs crate
 test_zk: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),shortint,zk-pok -p $(TFHE_SPEC) -- zk::
+		--features=shortint,zk-pok -p $(TFHE_SPEC) -- zk::

 .PHONY: test_integer # Run all the tests for integer
 test_integer: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::
+		--features=integer,internal-keycache -p $(TFHE_SPEC) -- integer::

 .PHONY: test_integer_cov # Run the tests of the integer module with code coverage
 test_integer_cov: install_rs_check_toolchain install_tarpaulin
@@ -797,38 +800,38 @@ test_integer_cov: install_rs_check_toolchain install_tarpaulin
 		--out xml --output-dir coverage/integer --line --engine llvm --timeout 500 \
 		--implicit-test-threads \
 		--exclude-files $(COVERAGE_EXCLUDED_FILES) \
-		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
+		--features=integer,internal-keycache \
 		-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::

 .PHONY: test_high_level_api # Run all the tests for high_level_api
 test_high_level_api: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok -p $(TFHE_SPEC) \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok -p $(TFHE_SPEC) \
 		-- high_level_api::

 test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
+		--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \
 		-E "test(/high_level_api::.*gpu.*/)"

 .PHONY: test_strings # Run the tests for strings ci
 test_strings: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),shortint,integer,strings -p $(TFHE_SPEC) \
+		--features=shortint,integer,strings -p $(TFHE_SPEC) \
 		-- strings::


 .PHONY: test_user_doc # Run tests from the .md documentation
 test_user_doc: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
+		--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
 		-p $(TFHE_SPEC) \
 		-- test_user_docs::

 .PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
 test_user_doc_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
+		--features=boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
 		-- test_user_docs::


@@ -836,14 +839,12 @@ test_user_doc_gpu: install_rs_build_toolchain
 .PHONY: test_regex_engine # Run tests for regex_engine example
 test_regex_engine: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--example regex_engine \
-		--features=$(TARGET_ARCH_FEATURE),integer
+		--example regex_engine --features=integer

 .PHONY: test_sha256_bool # Run tests for sha256_bool example
 test_sha256_bool: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--example sha256_bool \
-		--features=$(TARGET_ARCH_FEATURE),boolean
+		--example sha256_bool --features=boolean

 .PHONY: test_examples # Run tests for examples
 test_examples: test_sha256_bool test_regex_engine
@@ -861,7 +862,7 @@ test_kreyvium: install_rs_build_toolchain
 .PHONY: test_tfhe_csprng # Run tfhe-csprng tests
 test_tfhe_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng
+		-p tfhe-csprng

 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok: install_rs_build_toolchain
@@ -879,7 +880,7 @@ test_zk_wasm_x86_compat_ci: check_nvm_installed
 test_zk_wasm_x86_compat: install_rs_build_toolchain build_node_js_api
 	cd tfhe/tests/zk_wasm_x86_test && npm install
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		-p tfhe --test zk_wasm_x86_test --features=$(TARGET_ARCH_FEATURE),integer,zk-pok
+		-p tfhe --test zk_wasm_x86_test --features=integer,zk-pok

 .PHONY: test_versionable # Run tests for tfhe-versionable subcrate
 test_versionable: install_rs_build_toolchain
@@ -892,7 +893,7 @@ test_versionable: install_rs_build_toolchain
 test_backward_compatibility_ci: install_rs_build_toolchain
 	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
-		--features=$(TARGET_ARCH_FEATURE),shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
+		--features=shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
 test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
@@ -907,7 +908,7 @@ doc: install_rs_check_toolchain
 	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok --no-deps -p $(TFHE_SPEC)
+		--features=boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok --no-deps -p $(TFHE_SPEC)

 .PHONY: docs # Build rust doc alias for doc
 docs: doc
@@ -918,7 +919,7 @@ lint_doc: install_rs_check_toolchain
 	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok -p $(TFHE_SPEC) --no-deps
+		--features=boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok -p $(TFHE_SPEC) --no-deps

 .PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
 lint_docs: lint_doc
@@ -947,7 +948,7 @@ check_md_links: install_mlc
 .PHONY: check_compile_tests # Build tests in debug without running them
 check_compile_tests: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
-		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
+		--features=experimental,boolean,shortint,integer,internal-keycache \
 		-p $(TFHE_SPEC)

 	@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
@@ -958,7 +959,7 @@ check_compile_tests: install_rs_build_toolchain
 .PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
 check_compile_tests_benches_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
-		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,gpu \
+		--features=experimental,boolean,shortint,integer,internal-keycache,gpu \
 		-p $(TFHE_SPEC)
 	mkdir -p "$(TFHECUDA_BUILD)" && \
 		cd "$(TFHECUDA_BUILD)" && \
@@ -1037,42 +1038,42 @@ dieharder_csprng: install_dieharder build_tfhe_csprng
 .PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
 print_doc_bench_parameters:
 	RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p tfhe
+	--features=shortint,internal-keycache -p tfhe

 .PHONY: bench_integer # Run benchmarks for unsigned integer
 bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
@@ -1080,7 +1081,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -1088,7 +1089,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1096,7 +1097,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
 bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1104,14 +1105,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
+	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench zk-pke-bench \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,zk-pok,nightly-avx512 \
+	--features=integer,internal-keycache,zk-pok,nightly-avx512 \
 	-p $(TFHE_SPEC) --

 .PHONY: bench_shortint # Run benchmarks for shortint
@@ -1119,14 +1120,14 @@ bench_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_shortint_oprf # Run benchmarks for shortint
 bench_shortint_oprf: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench oprf-shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
 bench_shortint_multi_bit: install_rs_check_toolchain
@@ -1134,43 +1135,43 @@ bench_shortint_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_boolean # Run benchmarks for boolean
 bench_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench boolean-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_pbs # Run benchmarks for PBS
 bench_pbs: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_pbs128 # Run benchmarks for PBS using FFT 128 bits
 bench_pbs128: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs128-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
 bench_pbs_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_FAST_BENCH=$(FAST_BENCH) cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_ks # Run benchmarks for keyswitch
 bench_ks: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench ks-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_ks_gpu # Run benchmarks for keyswitch on GPU backend
 bench_ks_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench ks-bench \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 bench_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
 bench_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
@@ -1206,13 +1207,13 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
 bench_hlapi_erc20: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
 bench_hlapi_erc20_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
-	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
 bench_tfhe_zk_pok: install_rs_check_toolchain
@@ -1227,32 +1228,32 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 gen_key_cache: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 		--example generates_test_keys \
-		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,experimental,internal-keycache -p $(TFHE_SPEC) \
+		--features=boolean,shortint,experimental,internal-keycache -p $(TFHE_SPEC) \
 		-- $(MULTI_BIT_ONLY) $(COVERAGE_ONLY)

 .PHONY: gen_key_cache_core_crypto # Run function to generate keys and cache them for core_crypto tests
 gen_key_cache_core_crypto: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --tests --profile $(CARGO_PROFILE) \
-		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache -p $(TFHE_SPEC) -- --nocapture \
+		--features=experimental,internal-keycache -p $(TFHE_SPEC) -- --nocapture \
 		core_crypto::keycache::generate_keys

 .PHONY: measure_hlapi_compact_pk_ct_sizes # Measure sizes of public keys and ciphertext for high-level API
 measure_hlapi_compact_pk_ct_sizes: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example hlapi_compact_pk_ct_sizes \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache
+	--features=integer,internal-keycache

 .PHONY: measure_shortint_key_sizes # Measure sizes of bootstrapping and key switching keys for shortint
 measure_shortint_key_sizes: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example shortint_key_sizes \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache
+	--features=shortint,internal-keycache

 .PHONY: measure_boolean_key_sizes # Measure sizes of bootstrapping and key switching keys for boolean
 measure_boolean_key_sizes: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example boolean_key_sizes \
-	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache
+	--features=boolean,internal-keycache

 .PHONY: parse_integer_benches # Run python parser to output a csv containing integer benches data
 parse_integer_benches:
@@ -1264,14 +1265,13 @@ parse_integer_benches:
 parse_wasm_benchmarks: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example wasm_benchmarks_parser \
-	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
+	--features=shortint,internal-keycache \
 	-- wasm_benchmark_results.json

 .PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
 write_params_to_file: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example write_params_to_file \
-	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache
+	--example write_params_to_file --features=boolean,shortint,internal-keycache

 .PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
 clone_backward_compat_data:
@@ -1286,26 +1286,26 @@ tfhe/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
 .PHONY: regex_engine # Run regex_engine example
 regex_engine: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example regex_engine \
-	--features=$(TARGET_ARCH_FEATURE),integer \
+	--example regex_engine --features=integer \
 	-- $(REGEX_STRING) $(REGEX_PATTERN)

 .PHONY: dark_market # Run dark market example
 dark_market: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example dark_market \
-	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
+	--features=integer,internal-keycache \
 	-- fhe-modified fhe-parallel plain fhe

 .PHONY: sha256_bool # Run sha256_bool example
 sha256_bool: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example sha256_bool \
-	--features=$(TARGET_ARCH_FEATURE),boolean
+	--example sha256_bool --features=boolean

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
 pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested check_intra_md_links \
-clippy_all tfhe_lints check_compile_tests
+clippy_all check_compile_tests
+# tfhe_lints is temporarily deactivated as it is incompatible with 1.83
+# tfhe_lints

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu check_rust_bindings_did_not_change
--- a/README.md
+++ b/README.md
@@ -70,22 +70,8 @@ production-ready library for all the advanced features of TFHE.
 ### Cargo.toml configuration
 To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:

-+ For x86_64-based machines running Unix-like OSes:
-
 ```toml
-tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64-unix"] }
-```
-
-+ For Apple Silicon or aarch64-based machines running Unix-like OSes:
-
-```toml
-tfhe = { version = "*", features = ["boolean", "shortint", "integer", "aarch64-unix"] }
-```
-
-+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:
-
-```toml
-tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"] }
+tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
 ```

 > [!Note]
--- a/apps/trivium/Cargo.toml
+++ b/apps/trivium/Cargo.toml
@@ -6,15 +6,8 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-rayon = { version = "1.7.0"}
-
-[target.'cfg(target_arch = "x86_64")'.dependencies.tfhe]
-path = "../../tfhe"
-features = [ "boolean", "shortint", "integer", "x86_64" ]
-
-[target.'cfg(target_arch = "aarch64")'.dependencies.tfhe]
-path = "../../tfhe"
-features = [ "boolean", "shortint", "integer", "aarch64-unix" ]
+rayon = { workspace = true }
+tfhe = { path = "../../tfhe", features = [ "boolean", "shortint", "integer" ] }

 [dev-dependencies]
 criterion = { version = "0.5.1", features = [ "html_reports" ]}
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -38,6 +38,7 @@ template <typename Torus> struct int_compression {

      scratch_packing_keyswitch_lwe_list_to_glwe_64(
          streams[0], gpu_indexes[0], &fp_ks_buffer,
+          compression_params.small_lwe_dimension,
          compression_params.glwe_dimension, compression_params.polynomial_size,
          num_radix_blocks, true);
    }
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -46,7 +46,14 @@ void scratch_cuda_apply_univariate_lut_kb_64(
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory);
-
+void scratch_cuda_apply_many_univariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t num_many_lut, bool allocate_gpu_memory);
 void cuda_apply_univariate_lut_kb_64(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count, void *output_radix_lwe,
@@ -440,5 +447,41 @@ void cleanup_cuda_integer_abs_inplace(void *const *streams,
                                      uint32_t gpu_count,
                                      int8_t **mem_ptr_void);

+void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_are_all_comparisons_block_true_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
+
+void cleanup_cuda_integer_are_all_comparisons_block_true(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
+void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory);
+
+void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
+
+void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void);
+
 } // extern C
 #endif // CUDA_INTEGER_H
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -479,7 +479,6 @@ template <typename Torus> struct int_radix_lut {
        cuda_memcpy_async_gpu_to_gpu(dst_lut_indexes, src_lut_indexes,
                                     num_blocks * sizeof(Torus), streams[i],
                                     gpu_indexes[i]);
-        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
      }
    }
  }
@@ -1540,10 +1539,12 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
    cuda_memset_async(grouping_pgns, 0, num_groups * big_lwe_size_bytes,
                      streams[0], gpu_indexes[0]);

-    prepared_blocks = (Torus *)cuda_malloc_async(
-        num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-    cuda_memset_async(prepared_blocks, 0, num_radix_blocks * big_lwe_size_bytes,
-                      streams[0], gpu_indexes[0]);
+    prepared_blocks =
+        (Torus *)cuda_malloc_async((num_radix_blocks + 1) * big_lwe_size_bytes,
+                                   streams[0], gpu_indexes[0]);
+    cuda_memset_async(prepared_blocks, 0,
+                      (num_radix_blocks + 1) * big_lwe_size_bytes, streams[0],
+                      gpu_indexes[0]);

    resolved_carries = (Torus *)cuda_malloc_async(
        (num_groups + 1) * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
@@ -1773,7 +1774,6 @@ template <typename Torus> struct int_sc_prop_memory {
  uint32_t num_many_lut;
  uint32_t lut_stride;

-  uint32_t group_size;
  uint32_t num_groups;
  Torus *output_flag;
  Torus *last_lhs;
@@ -1781,8 +1781,6 @@ template <typename Torus> struct int_sc_prop_memory {
  int_radix_lut<Torus> *lut_message_extract;

  int_radix_lut<Torus> *lut_overflow_flag_prep;
-  int_radix_lut<Torus> *lut_overflow_flag_last;
-  int_radix_lut<Torus> *lut_carry_flag_last;

  int_shifted_blocks_and_states_memory<Torus> *shifted_blocks_state_mem;
  int_prop_simu_group_carries_memory<Torus> *prop_simu_group_carries_mem;
@@ -1792,8 +1790,6 @@ template <typename Torus> struct int_sc_prop_memory {
  uint32_t requested_flag;

  uint32_t active_gpu_count;
-  cudaStream_t *sub_streams_1;
-  cudaStream_t *sub_streams_2;

  cudaEvent_t *incoming_events1;
  cudaEvent_t *incoming_events2;
@@ -1818,7 +1814,6 @@ template <typename Torus> struct int_sc_prop_memory {
    uint32_t block_modulus = message_modulus * carry_modulus;
    uint32_t num_bits_in_block = std::log2(block_modulus);
    uint32_t grouping_size = num_bits_in_block;
-    group_size = grouping_size;
    num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;

    num_many_lut = 2; // many luts apply 2 luts
@@ -1835,8 +1830,8 @@ template <typename Torus> struct int_sc_prop_memory {

    //  Step 3 elements
    lut_message_extract =
-        new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
-                                 num_radix_blocks, allocate_gpu_memory);
+        new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
+                                 num_radix_blocks + 1, allocate_gpu_memory);
    // lut for the first block in the first grouping
    auto f_message_extract = [message_modulus](Torus block) -> Torus {
      return (block >> 1) % message_modulus;
@@ -1852,8 +1847,9 @@ template <typename Torus> struct int_sc_prop_memory {

     // This stores a single block that will be used to store the overflow or
    // carry results
-    output_flag = (Torus *)cuda_malloc_async(big_lwe_size_bytes, streams[0],
-                                             gpu_indexes[0]);
+    output_flag =
+        (Torus *)cuda_malloc_async(big_lwe_size_bytes * (num_radix_blocks + 1),
+                                   streams[0], gpu_indexes[0]);
    cuda_memset_async(output_flag, 0, big_lwe_size_bytes, streams[0],
                      gpu_indexes[0]);

@@ -1912,9 +1908,6 @@ template <typename Torus> struct int_sc_prop_memory {
    // It seems that this lut could be applied together with the other one but
    // for now we won't do it
    if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
-      lut_overflow_flag_last = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);
-
      auto f_overflow_last = [num_radix_blocks,
                              requested_flag_in](Torus block) -> Torus {
        uint32_t position = (num_radix_blocks == 1 &&
@@ -1930,39 +1923,57 @@ template <typename Torus> struct int_sc_prop_memory {
          return does_overflow_if_carry_is_0;
        }
      };
-      auto overflow_flag_last = lut_overflow_flag_last->get_lut(0, 0);
+      auto overflow_flag_last = lut_message_extract->get_lut(0, 1);

      generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], overflow_flag_last, glwe_dimension,
          polynomial_size, message_modulus, carry_modulus, f_overflow_last);

-      lut_overflow_flag_last->broadcast_lut(streams, gpu_indexes, 0);
+      Torus *h_lut_indexes =
+          (Torus *)malloc((num_radix_blocks + 1) * sizeof(Torus));
+      for (int index = 0; index < num_radix_blocks + 1; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+      cuda_memcpy_async_to_gpu(
+          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+          (num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0]);
+
+      lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
+      free(h_lut_indexes);
    }
    if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
-      lut_carry_flag_last = new int_radix_lut<Torus>(
-          streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory);

      auto f_carry_last = [](Torus block) -> Torus {
        return ((block >> 2) & 1);
      };
-      auto carry_flag_last = lut_carry_flag_last->get_lut(0, 0);
+      auto carry_flag_last = lut_message_extract->get_lut(0, 1);

      generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], carry_flag_last, glwe_dimension,
          polynomial_size, message_modulus, carry_modulus, f_carry_last);

-      lut_carry_flag_last->broadcast_lut(streams, gpu_indexes, 0);
+      Torus *h_lut_indexes =
+          (Torus *)malloc((num_radix_blocks + 1) * sizeof(Torus));
+      for (int index = 0; index < num_radix_blocks + 1; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+      cuda_memcpy_async_to_gpu(
+          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
+          (num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0]);
+
+      lut_message_extract->broadcast_lut(streams, gpu_indexes, 0);
+      free(h_lut_indexes);
    }

    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
-    sub_streams_1 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    sub_streams_2 =
-        (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
-    for (uint j = 0; j < active_gpu_count; j++) {
-      sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
-      sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
-    }

    incoming_events1 =
        (cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
@@ -1998,24 +2009,10 @@ template <typename Torus> struct int_sc_prop_memory {

    if (requested_flag == outputFlag::FLAG_OVERFLOW) { // In case of overflow
      lut_overflow_flag_prep->release(streams, gpu_indexes, gpu_count);
-      lut_overflow_flag_last->release(streams, gpu_indexes, gpu_count);
      delete lut_overflow_flag_prep;
-      delete lut_overflow_flag_last;
      cuda_drop_async(last_lhs, streams[0], gpu_indexes[0]);
      cuda_drop_async(last_rhs, streams[0], gpu_indexes[0]);
    }
-    if (requested_flag == outputFlag::FLAG_CARRY) { // In case of carry
-      lut_carry_flag_last->release(streams, gpu_indexes, gpu_count);
-      delete lut_carry_flag_last;
-    }
-
-    // release sub streams
-    for (uint i = 0; i < active_gpu_count; i++) {
-      cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
-      cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
-    }
-    free(sub_streams_1);
-    free(sub_streams_2);

    // release events
    for (uint j = 0; j < active_gpu_count; j++) {
@@ -2955,14 +2952,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {

 template <typename Torus> struct int_cmux_buffer {
  int_radix_lut<Torus> *predicate_lut;
-  int_radix_lut<Torus> *inverted_predicate_lut;
  int_radix_lut<Torus> *message_extract_lut;

-  Torus *tmp_true_ct;
-  Torus *tmp_false_ct;
-
-  int_zero_out_if_buffer<Torus> *zero_if_true_buffer;
-  int_zero_out_if_buffer<Torus> *zero_if_false_buffer;
+  Torus *buffer_in;
+  Torus *buffer_out;
+  Torus *condition_array;

  int_radix_params params;

@@ -2978,17 +2972,12 @@ template <typename Torus> struct int_cmux_buffer {
      Torus big_size =
          (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);

-      tmp_true_ct =
-          (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
-      tmp_false_ct =
-          (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
-
-      zero_if_true_buffer = new int_zero_out_if_buffer<Torus>(
-          streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-          allocate_gpu_memory);
-      zero_if_false_buffer = new int_zero_out_if_buffer<Torus>(
-          streams, gpu_indexes, gpu_count, params, num_radix_blocks,
-          allocate_gpu_memory);
+      buffer_in =
+          (Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
+      buffer_out =
+          (Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
+      condition_array =
+          (Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);

      auto lut_f = [predicate_lut_f](Torus block, Torus condition) -> Torus {
        return predicate_lut_f(condition) ? 0 : block;
@@ -3002,12 +2991,8 @@ template <typename Torus> struct int_cmux_buffer {
      };

      predicate_lut =
-          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
-                                   num_radix_blocks, allocate_gpu_memory);
-
-      inverted_predicate_lut =
-          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
-                                   num_radix_blocks, allocate_gpu_memory);
+          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
+                                   2 * num_radix_blocks, allocate_gpu_memory);

      message_extract_lut =
          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
@@ -3016,21 +3001,33 @@ template <typename Torus> struct int_cmux_buffer {
      generate_device_accumulator_bivariate<Torus>(
          streams[0], gpu_indexes[0], predicate_lut->get_lut(0, 0),
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f);
+          params.carry_modulus, inverted_lut_f);

      generate_device_accumulator_bivariate<Torus>(
-          streams[0], gpu_indexes[0], inverted_predicate_lut->get_lut(0, 0),
+          streams[0], gpu_indexes[0], predicate_lut->get_lut(0, 1),
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
-          params.carry_modulus, inverted_lut_f);
+          params.carry_modulus, lut_f);

      generate_device_accumulator<Torus>(
          streams[0], gpu_indexes[0], message_extract_lut->get_lut(0, 0),
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
          params.carry_modulus, message_extract_lut_f);
+      Torus *h_lut_indexes =
+          (Torus *)malloc(2 * num_radix_blocks * sizeof(Torus));
+      for (int index = 0; index < 2 * num_radix_blocks; index++) {
+        if (index < num_radix_blocks) {
+          h_lut_indexes[index] = 0;
+        } else {
+          h_lut_indexes[index] = 1;
+        }
+      }
+      cuda_memcpy_async_to_gpu(
+          predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
+          2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]);

      predicate_lut->broadcast_lut(streams, gpu_indexes, 0);
-      inverted_predicate_lut->broadcast_lut(streams, gpu_indexes, 0);
      message_extract_lut->broadcast_lut(streams, gpu_indexes, 0);
+      free(h_lut_indexes);
    }
  }

@@ -3038,18 +3035,12 @@ template <typename Torus> struct int_cmux_buffer {
               uint32_t gpu_count) {
    predicate_lut->release(streams, gpu_indexes, gpu_count);
    delete predicate_lut;
-    inverted_predicate_lut->release(streams, gpu_indexes, gpu_count);
-    delete inverted_predicate_lut;
    message_extract_lut->release(streams, gpu_indexes, gpu_count);
    delete message_extract_lut;

-    zero_if_true_buffer->release(streams, gpu_indexes, gpu_count);
-    delete zero_if_true_buffer;
-    zero_if_false_buffer->release(streams, gpu_indexes, gpu_count);
-    delete zero_if_false_buffer;
-
-    cuda_drop_async(tmp_true_ct, streams[0], gpu_indexes[0]);
-    cuda_drop_async(tmp_false_ct, streams[0], gpu_indexes[0]);
+    cuda_drop_async(buffer_in, streams[0], gpu_indexes[0]);
+    cuda_drop_async(buffer_out, streams[0], gpu_indexes[0]);
+    cuda_drop_async(condition_array, streams[0], gpu_indexes[0]);
  }
 };

@@ -3063,7 +3054,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
  // This map stores LUTs that check the equality between some input and values
  // of interest in are_all_block_true(), as with max_value (the maximum message
  // value).
-  std::unordered_map<int, int_radix_lut<Torus> *> is_equal_to_lut_map;
+  int_radix_lut<Torus> *is_max_value;

  int_are_all_block_true_buffer(cudaStream_t const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -3084,16 +3075,26 @@ template <typename Torus> struct int_are_all_block_true_buffer {
      tmp_out = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
                                               num_radix_blocks * sizeof(Torus),
                                           streams[0], gpu_indexes[0]);
+      is_max_value =
+          new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
+                                   max_chunks, allocate_gpu_memory);
+      auto is_max_value_f = [max_value](Torus x) -> Torus {
+        return x == max_value;
+      };
+
+      generate_device_accumulator<Torus>(
+          streams[0], gpu_indexes[0], is_max_value->get_lut(0, 0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, is_max_value_f);
+
+      is_max_value->broadcast_lut(streams, gpu_indexes, 0);
    }
  }

  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
-    for (auto &lut : is_equal_to_lut_map) {
-      lut.second->release(streams, gpu_indexes, gpu_count);
-      delete (lut.second);
-    }
-    is_equal_to_lut_map.clear();
+    is_max_value->release(streams, gpu_indexes, gpu_count);
+    delete (is_max_value);

    cuda_drop_async(tmp_block_accumulated, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_out, streams[0], gpu_indexes[0]);
@@ -3270,8 +3271,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
  int_radix_params params;
  COMPARISON_TYPE op;

-  Torus *tmp_packed_left;
-  Torus *tmp_packed_right;
+  Torus *tmp_packed;

  std::function<Torus(Torus)> operator_f;

@@ -3308,11 +3308,8 @@ template <typename Torus> struct int_comparison_diff_buffer {

      Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);

-      tmp_packed_left = (Torus *)cuda_malloc_async(
-          big_size * (num_radix_blocks / 2), streams[0], gpu_indexes[0]);
-
-      tmp_packed_right = (Torus *)cuda_malloc_async(
-          big_size * (num_radix_blocks / 2), streams[0], gpu_indexes[0]);
+      tmp_packed = (Torus *)cuda_malloc_async(big_size * num_radix_blocks,
+                                              streams[0], gpu_indexes[0]);

      tree_buffer = new int_tree_sign_reduction_buffer<Torus>(
          streams, gpu_indexes, gpu_count, operator_f, params, num_radix_blocks,
@@ -3335,8 +3332,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
    reduce_signs_lut->release(streams, gpu_indexes, gpu_count);
    delete reduce_signs_lut;

-    cuda_drop_async(tmp_packed_left, streams[0], gpu_indexes[0]);
-    cuda_drop_async(tmp_packed_right, streams[0], gpu_indexes[0]);
+    cuda_drop_async(tmp_packed, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_signs_a, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_signs_b, streams[0], gpu_indexes[0]);
  }
@@ -3685,9 +3681,9 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
          [shifted_mask](Torus x) -> Torus { return x & shifted_mask; };

      masking_luts_1[i] = new int_radix_lut<Torus>(
-          streams, gpu_indexes, 1, params, 1, 1, true);
+          streams, gpu_indexes, gpu_count, params, 1, 1, true);
      masking_luts_2[i] = new int_radix_lut<Torus>(
-          streams, gpu_indexes, 1, params, 1, num_blocks, true);
+          streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);

      int_radix_lut<Torus> *luts[2] = {masking_luts_1[i], masking_luts_2[i]};

@@ -3704,7 +3700,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
    // both of them are equal but because they are used in two different
    // executions in parallel we need two different pbs_buffers.
    message_extract_lut_1 = new int_radix_lut<Torus>(
-        streams, gpu_indexes, 1, params, 1, num_blocks, true);
+        streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);
    message_extract_lut_2 = new int_radix_lut<Torus>(
        streams, gpu_indexes, gpu_count, params, 1, num_blocks, true);

@@ -3816,16 +3812,16 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

    this->params = params;
    shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
-        streams, gpu_indexes, 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
+        streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
        params, 2 * num_blocks, true);

    shift_mem_2 = new int_logical_scalar_shift_buffer<Torus>(
-        streams, gpu_indexes, 1, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
+        streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
        params, 2 * num_blocks, true);

    uint32_t compute_overflow = 1;
    overflow_sub_mem = new int_borrow_prop_memory<Torus>(
-        streams, gpu_indexes, 1, params, num_blocks, compute_overflow,
+        streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
        true);
    uint32_t group_size = overflow_sub_mem->group_size;
    bool use_seq = overflow_sub_mem->prop_simu_group_carries_mem
@@ -3834,7 +3830,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
                                    group_size, use_seq);

    comparison_buffer = new int_comparison_buffer<Torus>(
-        streams, gpu_indexes, 1, COMPARISON_TYPE::NE, params,
+        streams, gpu_indexes, gpu_count, COMPARISON_TYPE::NE, params,
        num_blocks, false, true);

    init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks);
@@ -4275,12 +4271,15 @@ template <typename Torus> struct int_scalar_mul_buffer {
  Torus *preshifted_buffer;
  Torus *all_shifted_buffer;
  int_sc_prop_memory<Torus> *sc_prop_mem;
+  bool anticipated_buffers_drop;

  int_scalar_mul_buffer(cudaStream_t const *streams,
                        uint32_t const *gpu_indexes, uint32_t gpu_count,
                        int_radix_params params, uint32_t num_radix_blocks,
-                        bool allocate_gpu_memory) {
+                        bool allocate_gpu_memory,
+                        bool anticipated_buffer_drop) {
    this->params = params;
+    this->anticipated_buffers_drop = anticipated_buffer_drop;

    if (allocate_gpu_memory) {
      uint32_t msg_bits = (uint32_t)std::log2(params.message_modulus);
@@ -4328,6 +4327,11 @@ template <typename Torus> struct int_scalar_mul_buffer {
    delete sum_ciphertexts_vec_mem;
    delete sc_prop_mem;
    cuda_drop_async(all_shifted_buffer, streams[0], gpu_indexes[0]);
+    if (!anticipated_buffers_drop) {
+      cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+      logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
+      delete (logical_scalar_shift_buffer);
+    }
  }
 };

--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -21,8 +21,8 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(

 void scratch_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
-    bool allocate_gpu_memory);
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t num_lwes, bool allocate_gpu_memory);

 void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, void *glwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
@@ -0,0 +1,358 @@
+#ifndef CNCRT_FAST_KS_CUH
+#define CNCRT_FAST_KS_CUH
+
+#undef NDEBUG
+#include <assert.h>
+
+#include "device.h"
+#include "gadget.cuh"
+#include "helper_multi_gpu.h"
+#include "keyswitch.cuh"
+#include "polynomial/functions.cuh"
+#include "polynomial/polynomial_math.cuh"
+#include "torus.cuh"
+#include "utils/helper.cuh"
+#include "utils/kernel_dimensions.cuh"
+#include <thread>
+#include <vector>
+
+// Integer ceiling division of M by N. The whole expansion is parenthesized so
+// the macro behaves as a single operand inside larger expressions such as
+// `x / CEIL_DIV(a, b)` or `x % CEIL_DIV(a, b)`.
+#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
+
+// Edge length of the square block of the output matrix handled by one tgemm
+// thread block (used as both BM and BN in tgemm).
+const int BLOCK_SIZE_GEMM = 64;
+// Depth of the shared-memory tiles in tgemm (BK) and number of results each
+// tgemm thread accumulates (TM).
+const int THREADS_GEMM = 8;
+// Edge length of the 2-D thread blocks used to launch the decomposition and
+// rotation kernels, and size of the 1-D blocks used for the GLWE accumulation.
+const int BLOCK_SIZE_DECOMP = 8;
+
+// Returns the size in bytes of two BLOCK_SIZE_GEMM x THREADS_GEMM tiles of
+// Torus values.
+template <typename Torus> uint64_t get_shared_mem_size_tgemm() {
+  const uint64_t tile_elements = BLOCK_SIZE_GEMM * THREADS_GEMM;
+  const uint64_t num_tiles = 2;
+  return num_tiles * tile_elements * sizeof(Torus);
+}
+
+// Tells the caller whether the GEMM-based packing keyswitch may be used for
+// the given parameters. The fast path is currently switched off: all the
+// arguments are ignored and the answer is always false.
+__host__ inline bool can_use_pks_fast_path(uint32_t lwe_dimension,
+                                           uint32_t num_lwe,
+                                           uint32_t polynomial_size,
+                                           uint32_t level_count,
+                                           uint32_t glwe_dimension) {
+  // TODO: activate it back, fix tests and extend to level_count > 1
+  const bool fast_path_enabled = false;
+  return fast_path_enabled;
+}
+
+// First decomposition step for a batch of Torus LWEs: each thread rounds one
+// mask element (init_decomposer_state) and writes its first decomposed level
+// (decompose_one) to lwe_out. The LWE bodies are never read.
+//
+// Thread mapping: the x dimension selects the LWE ciphertext, the y dimension
+// selects the mask element inside that ciphertext.
+template <typename Torus, typename TorusVec>
+__global__ void decompose_vectorize_init(Torus const *lwe_in, Torus *lwe_out,
+                                         uint32_t lwe_dimension,
+                                         uint32_t num_lwe, uint32_t base_log,
+                                         uint32_t level_count) {
+
+  // which LWE ct of the batch this thread works on
+  auto ct_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  // which mask element of that LWE ct this thread works on
+  auto elem_idx = blockIdx.y * blockDim.y + threadIdx.y;
+
+  const bool in_range = ct_idx < num_lwe && elem_idx < lwe_dimension;
+  if (!in_range)
+    return;
+
+  // The input is laid out as [mask_0, .., mask_lwe_dim, message] per LWE, so
+  // consecutive inputs are lwe_dimension + 1 apart. The output only holds the
+  // masks, so consecutive outputs are lwe_dimension apart.
+  auto src_idx = ct_idx * (lwe_dimension + 1) + elem_idx;
+  auto dst_idx = ct_idx * lwe_dimension + elem_idx;
+
+  Torus mask_elem = lwe_in[src_idx];
+  Torus state = init_decomposer_state(mask_elem, base_log, level_count);
+
+  Torus mod_b_mask = (1ll << base_log) - 1ll;
+  lwe_out[dst_idx] = decompose_one<Torus>(state, mod_b_mask, base_log);
+}
+
+// Continue the decomposition of an array of Torus elements in place. Each
+// thread reads one element of buffer_in, treats it as the decomposition state,
+// and overwrites it with the result of decompose_one.
+//
+// Thread mapping: the x dimension selects the LWE ciphertext, the y dimension
+// selects the mask element inside that ciphertext. level_count is not used.
+template <typename Torus, typename TorusVec>
+__global__ void
+decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension,
+                                 uint32_t num_lwe, uint32_t base_log,
+                                 uint32_t level_count) {
+
+  // which LWE ct of the batch this thread works on
+  auto ct_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  // which mask element of that LWE ct this thread works on
+  auto elem_idx = blockIdx.y * blockDim.y + threadIdx.y;
+
+  const bool in_range = ct_idx < num_lwe && elem_idx < lwe_dimension;
+  if (!in_range)
+    return;
+
+  // the buffer only stores masks: lwe_dimension elements per LWE
+  auto pos = ct_idx * lwe_dimension + elem_idx;
+
+  Torus mod_b_mask = (1ll << base_log) - 1ll;
+  Torus state = buffer_in[pos];
+
+  buffer_in[pos] = decompose_one<Torus>(state, mod_b_mask, base_log);
+}
+
+// Multiply matrices A, B of size (M, K), (K, N) respectively
+// with K as the inner dimension.
+//
+// A block of threads processes blocks of size (BLOCK_SIZE_GEMM,
+// BLOCK_SIZE_GEMM) splitting them in multiple tiles: (BLOCK_SIZE_GEMM,
+// THREADS_GEMM)-shaped tiles of values from A, and a (THREADS_GEMM,
+// BLOCK_SIZE_GEMM)-shaped tiles of values from B.
+//
+// A is read with a row stride of K and B with a row stride of stride_B. The
+// products are added (+=) to the values already stored in C, which is indexed
+// with a row stride of N, so C has to be initialized by the caller.
+// blockIdx.y / blockIdx.x select the row / column block of C. The thread
+// indexing below expects a 1-D thread block of
+// BLOCK_SIZE_GEMM * THREADS_GEMM threads.
+template <typename Torus, typename TorusVec>
+__global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
+                      int stride_B, Torus *C) {
+
+  const int BM = BLOCK_SIZE_GEMM;
+  const int BN = BLOCK_SIZE_GEMM;
+  const int BK = THREADS_GEMM;
+  const int TM = THREADS_GEMM;
+
+  const uint cRow = blockIdx.y;
+  const uint cCol = blockIdx.x;
+
+  // NOTE(review): totalResultsBlocktile is not referenced anywhere below
+  const uint totalResultsBlocktile = BM * BN;
+  // Position of this thread inside the output block: one column, and a group
+  // of TM consecutive rows starting at threadRow * TM
+  const int threadCol = threadIdx.x % BN;
+  const int threadRow = threadIdx.x / BN;
+
+  // Allocate space for the current block tile in shared memory
+  __shared__ Torus As[BM * BK];
+  __shared__ Torus Bs[BK * BN];
+
+  // Initialize the pointers to the input blocks from A, B
+  // Tiles from these blocks are loaded to shared memory
+  A += cRow * BM * K;
+  B += cCol * BN;
+
+  // Each thread will handle multiple sub-blocks
+  // (innerRowA, innerColA) / (innerRowB, innerColB) is the single element of
+  // the As / Bs tile that this thread loads at every iteration
+  const uint innerColA = threadIdx.x % BK;
+  const uint innerRowA = threadIdx.x / BK;
+  const uint innerColB = threadIdx.x % BN;
+  const uint innerRowB = threadIdx.x / BN;
+
+  // allocate thread-local cache for results in registerfile
+  Torus threadResults[TM] = {0};
+
+  // Global row of A / column of B loaded by this thread, used for the bounds
+  // checks below
+  auto row_A = cRow * BM + innerRowA;
+  auto col_B = cCol * BN + innerColB;
+
+  // For each thread, loop over block tiles
+  for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) {
+    auto col_A = bkIdx + innerColA;
+    auto row_B = bkIdx + innerRowB;
+
+    // Out-of-range elements are replaced by 0 so that they do not contribute
+    // to the dot products
+    if (row_A < M && col_A < K) {
+      As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA];
+    } else {
+      As[innerRowA * BK + innerColA] = 0;
+    }
+
+    if (col_B < N && row_B < K) {
+      Bs[innerRowB * BN + innerColB] = B[innerRowB * stride_B + innerColB];
+    } else {
+      Bs[innerRowB * BN + innerColB] = 0;
+    }
+    // Both tiles must be fully loaded before any thread reads them
+    __syncthreads();
+
+    // Advance blocktile for the next iteration of this loop
+    A += BK;
+    B += BK * stride_B;
+
+    // calculate per-thread results
+    for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
+      // we make the dotproduct loop the outside loop, which facilitates
+      // reuse of the Bs entry, which we can cache in a tmp var.
+      Torus tmp = Bs[dotIdx * BN + threadCol];
+      for (uint resIdx = 0; resIdx < TM; ++resIdx) {
+        threadResults[resIdx] +=
+            As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp;
+      }
+    }
+    // All reads of the tiles must be done before the next iteration
+    // overwrites them
+    __syncthreads();
+  }
+
+  // Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM,
+  // BLOCK_SIZE_GEMM)
+  C += cRow * BM * N + cCol * BN;
+
+  // write out the results
+  for (uint resIdx = 0; resIdx < TM; ++resIdx) {
+    int outRow = cRow * BM + threadRow * TM + resIdx;
+    int outCol = cCol * BN + threadCol;
+
+    // Skip the results that fall outside of the (M, N) output matrix
+    if (outRow >= M)
+      continue;
+    if (outCol >= N)
+      continue;
+
+    // Accumulate into C instead of overwriting it
+    C[(threadRow * TM + resIdx) * N + threadCol] += threadResults[resIdx];
+  }
+}
+
+// Finish the keyswitching operation and prepare GLWEs for accumulation.
+// 1. Finish the keyswitching computation partially performed with a GEMM:
+//  - negate the dot product between the GLWE and KSK polynomial
+//  - add the GLWE message for the N-th polynomial coeff in the message poly
+// 2. Rotate each of the GLWE . KSK poly dot products to
+//    prepare them for accumulation into a single GLWE
+//
+// Thread mapping: the x dimension selects the GLWE (glwe_id), the y dimension
+// selects the coefficient (coeffIdx) handled in each of its polynomials.
+// in_glwe_buffer is modified in place before being passed, together with
+// out_glwe_buffer, to polynomial_accumulate_monic_monomial_mul. GLWE number
+// glwe_id is rotated by degree glwe_id.
+template <typename Torus>
+__global__ void polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C(
+    Torus *in_glwe_buffer, Torus *out_glwe_buffer, Torus const *lwe_array,
+    uint32_t lwe_dimension, uint32_t num_glwes, uint32_t polynomial_size,
+    uint32_t glwe_dimension) {
+
+  uint32_t glwe_id = blockIdx.x * blockDim.x + threadIdx.x;
+  uint32_t degree = glwe_id; // lwe 0 rotate 0, lwe 1 rotate 1, .. , lwe
+                             // poly_size-1 rotate poly_size-1
+  uint32_t coeffIdx = blockIdx.y * blockDim.y + threadIdx.y;
+
+  if (glwe_id >= num_glwes)
+    return;
+  if (coeffIdx >= polynomial_size)
+    return;
+
+  // Each GLWE occupies (glwe_dimension + 1) polynomials of polynomial_size
+  // coefficients in both buffers
+  auto in_poly =
+      in_glwe_buffer + glwe_id * polynomial_size * (glwe_dimension + 1);
+  auto out_result =
+      out_glwe_buffer + glwe_id * polynomial_size * (glwe_dimension + 1);
+  if (coeffIdx == 0) {
+    // Add the message value of the input LWE (`C`) to the N-th coefficient
+    // in the GLWE . KSK dot product
+
+    // The C is added to the first position of the last polynomial in the GLWE
+    // which has (glwe_dimension+1) polynomials
+    // The C value is extracted as the last value of the LWE ct. (of index
+    // glwe_id) the LWEs have (lwe_dimension + 1) values
+    in_poly[polynomial_size * glwe_dimension] =
+        lwe_array[glwe_id * (lwe_dimension + 1) + lwe_dimension] -
+        in_poly[polynomial_size * glwe_dimension];
+
+    // The last polynomial was already handled above, so only the polynomials
+    // 1 .. glwe_dimension - 1 are negated here
+    for (int gi = 1; gi < glwe_dimension; ++gi)
+      in_poly[coeffIdx + gi * polynomial_size] =
+          -in_poly[coeffIdx + gi * polynomial_size];
+
+  } else {
+    // Otherwise simply negate the input coefficient
+    for (int gi = 1; gi < glwe_dimension + 1; ++gi)
+      in_poly[coeffIdx + gi * polynomial_size] =
+          -in_poly[coeffIdx + gi * polynomial_size];
+  }
+  // Negate all the coefficients for rotation for the first poly
+  in_poly[coeffIdx] = -in_poly[coeffIdx];
+
+  // NOTE(review): this thread only rewrote the coefficients at coeffIdx. If
+  // the rotation helper below reads other coefficients of in_poly, they may
+  // not have been updated yet by the threads that own them - confirm.
+
+  // rotate the body
+  polynomial_accumulate_monic_monomial_mul<Torus>(
+      out_result, in_poly, degree, coeffIdx, polynomial_size, 1, true);
+  // rotate the mask too
+  for (int gi = 1; gi < glwe_dimension + 1; ++gi)
+    polynomial_accumulate_monic_monomial_mul<Torus>(
+        out_result + gi * polynomial_size, in_poly + gi * polynomial_size,
+        degree, coeffIdx, polynomial_size, 1, true);
+}
+
+// Host entry point of the GEMM-based ("fast path") packing keyswitch of
+// num_lwes LWEs into a single GLWE written to glwe_out.
+//
+// The work is enqueued on `stream` in four steps: decomposition of the LWE
+// masks, tgemm with fp_ksk_array, negation / body addition / rotation of each
+// resulting GLWE, and accumulation of all the GLWEs into glwe_out.
+// fp_ks_buffer is the scratch buffer; it is used as two halves of
+// num_lwes * max(glwe_accumulator_size, lwe_dimension) Torus elements each.
+// Panics if level_count > 1.
+template <typename Torus, typename TorusVec>
+__host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
+    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
+    Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
+
+  // Optimization of packing keyswitch when packing many LWEs
+
+  if (level_count > 1) {
+    PANIC("Fast path PKS only supports level_count==1");
+  }
+
+  cudaSetDevice(gpu_index);
+  check_cuda_error(cudaGetLastError());
+
+  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
+
+  // The fast path of PKS uses the scratch buffer (d_mem) differently than the
+  // old path: it needs to store the decomposed masks in the first half of this
+  // buffer and the keyswitched GLWEs in the second half of the buffer. Thus the
+  // scratch buffer for the fast path must determine the half-size of the
+  // scratch buffer as the max between the size of the GLWE and the size of the
+  // LWE-mask
+  int memory_unit = glwe_accumulator_size > lwe_dimension
+                        ? glwe_accumulator_size
+                        : lwe_dimension;
+
+  // ping pong the buffer between successive calls
+  // split the buffer in two parts of this size
+  auto d_mem_0 = (Torus *)fp_ks_buffer;
+  auto d_mem_1 = d_mem_0 + num_lwes * memory_unit;
+
+  // Set the second half of the scratch buffer to 0: it is the output of
+  // tgemm, which adds its results to the values already stored there
+  cuda_memset_async(d_mem_1, 0, num_lwes * memory_unit * sizeof(Torus), stream,
+                    gpu_index);
+  check_cuda_error(cudaGetLastError());
+
+  // decompose LWEs
+  // don't decompose LWE body - the LWE has lwe_dimension + 1 elements. The
+  // last element, the body, is skipped because the grid only covers
+  // lwe_dimension elements per LWE and the kernel bounds-checks its indices
+  dim3 grid_decomp(CEIL_DIV(num_lwes, BLOCK_SIZE_DECOMP),
+                   CEIL_DIV(lwe_dimension, BLOCK_SIZE_DECOMP));
+  dim3 threads_decomp(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);
+
+  // decompose first level
+  decompose_vectorize_init<Torus, TorusVec>
+      <<<grid_decomp, threads_decomp, 0, stream>>>(lwe_array_in, d_mem_0,
+                                                   lwe_dimension, num_lwes,
+                                                   base_log, level_count);
+  check_cuda_error(cudaGetLastError());
+
+  // gemm to ks the individual LWEs to GLWEs
+  // (num_lwes, lwe_dimension) x (lwe_dimension, glwe_accumulator_size)
+  dim3 grid_gemm(CEIL_DIV(glwe_accumulator_size, BLOCK_SIZE_GEMM),
+                 CEIL_DIV(num_lwes, BLOCK_SIZE_GEMM));
+  dim3 threads_gemm(BLOCK_SIZE_GEMM * THREADS_GEMM);
+
+  auto stride_KSK_buffer = glwe_accumulator_size;
+
+  // NOTE(review): tgemm declares its tiles as static __shared__ arrays, so
+  // this dynamic shared memory size looks redundant - confirm.
+  uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
+  tgemm<Torus, TorusVec><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
+      num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
+      stride_KSK_buffer, d_mem_1);
+  check_cuda_error(cudaGetLastError());
+
+  /*
+    TODO: transpose key to generalize to level_count > 1
+
+    for (int li = 1; li < level_count; ++li) {
+      decompose_vectorize_step_inplace<Torus, TorusVec>
+          <<<grid_decomp, threads_decomp, 0, stream>>>(
+              d_mem_0, lwe_dimension, num_lwes, base_log, level_count);
+      check_cuda_error(cudaGetLastError());
+
+      tgemm<Torus, TorusVec><<<grid_gemm, threads_gemm, shared_mem_size,
+    stream>>>( num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0,
+          fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1);
+      check_cuda_error(cudaGetLastError());
+    }
+  */
+
+  // should we include the mask in the rotation ??
+  dim3 grid_rotate(CEIL_DIV(num_lwes, BLOCK_SIZE_DECOMP),
+                   CEIL_DIV(polynomial_size, BLOCK_SIZE_DECOMP));
+  dim3 threads_rotate(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);
+  // rotate the GLWEs
+  // the buffers swap roles here: d_mem_1 (tgemm output) is the input and
+  // d_mem_0 receives the rotated GLWEs
+  polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C<Torus>
+      <<<grid_rotate, threads_rotate, 0, stream>>>(
+          d_mem_1, d_mem_0, lwe_array_in, lwe_dimension, num_lwes,
+          polynomial_size, glwe_dimension);
+  check_cuda_error(cudaGetLastError());
+
+  dim3 grid_accumulate(
+      CEIL_DIV(polynomial_size * (glwe_dimension + 1), BLOCK_SIZE_DECOMP));
+  dim3 threads_accum(BLOCK_SIZE_DECOMP);
+
+  // accumulate to a single glwe
+  accumulate_glwes<Torus><<<grid_accumulate, threads_accum, 0, stream>>>(
+      glwe_out, d_mem_0, glwe_dimension, polynomial_size, num_lwes);
+
+  check_cuda_error(cudaGetLastError());
+}
+
+#endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -1,6 +1,8 @@
+#include "fast_packing_keyswitch.cuh"
 #include "keyswitch.cuh"
 #include "keyswitch.h"
 #include <cstdint>
+#include <stdio.h>

 /* Perform keyswitch on a batch of 32 bits input LWE ciphertexts.
 * Head out to the equivalent operation on 64 bits for more details.
@@ -53,15 +55,17 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(

 void scratch_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
-    bool allocate_gpu_memory) {
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t num_lwes, bool allocate_gpu_memory) {
  scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer,
+      static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer, lwe_dimension,
      glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
 }
+
 /* Perform functional packing keyswitch on a batch of 64 bits input LWE
 * ciphertexts.
 */
+
 void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, void *glwe_array_out,
    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
@@ -69,13 +73,24 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes) {

-  host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index,
-      static_cast<uint64_t *>(glwe_array_out),
-      static_cast<const uint64_t *>(lwe_array_in),
-      static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
-      input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
-      base_log, level_count, num_lwes);
+  if (can_use_pks_fast_path(input_lwe_dimension, num_lwes,
+                            output_polynomial_size, level_count,
+                            output_glwe_dimension)) {
+    host_fast_packing_keyswitch_lwe_list_to_glwe<uint64_t, ulonglong4>(
+        static_cast<cudaStream_t>(stream), gpu_index,
+        static_cast<uint64_t *>(glwe_array_out),
+        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
+        input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
+        base_log, level_count, num_lwes);
+  } else
+    host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
+        static_cast<cudaStream_t>(stream), gpu_index,
+        static_cast<uint64_t *>(glwe_array_out),
+        static_cast<const uint64_t *>(lwe_array_in),
+        static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
+        input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
+        base_log, level_count, num_lwes);
 }

 void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -158,16 +158,20 @@ void execute_keyswitch_async(cudaStream_t const *streams,
 template <typename Torus>
 __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
    cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
-    bool allocate_gpu_memory) {
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t num_lwes, bool allocate_gpu_memory) {
  cudaSetDevice(gpu_index);

  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

-  if (allocate_gpu_memory)
+  int memory_unit = glwe_accumulator_size > lwe_dimension
+                        ? glwe_accumulator_size
+                        : lwe_dimension;
+
+  if (allocate_gpu_memory) {
    *fp_ks_buffer = (int8_t *)cuda_malloc_async(
-        2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream,
-        gpu_index);
+        2 * num_lwes * memory_unit * sizeof(Torus), stream, gpu_index);
+  }
 }

 // public functional packing keyswitch for a single LWE ciphertext
@@ -241,6 +245,7 @@ __global__ void packing_keyswitch_lwe_list_to_glwe(
  auto lwe_in = lwe_array_in + input_id * lwe_size;
  auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
  auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
+
  // KS LWE to GLWE
  packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext<Torus>(
      ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
@@ -293,8 +298,18 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
  dim3 grid(num_blocks, num_lwes);
  dim3 threads(num_threads);

+  // The fast path of PKS uses the scratch buffer (d_mem) differently:
+  // it needs to store the decomposed masks in the first half of this buffer
+  // and the keyswitched GLWEs in the second half of the buffer. Thus the
+  // scratch buffer for the fast path must determine the half-size of the
+  // scratch buffer as the max between the size of the GLWE and the size of the
+  // LWE-mask
+  int memory_unit = glwe_accumulator_size > lwe_dimension_in
+                        ? glwe_accumulator_size
+                        : lwe_dimension_in;
+
  auto d_mem = (Torus *)fp_ks_buffer;
-  auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;
+  auto d_tmp_glwe_array_out = d_mem + num_lwes * memory_unit;

  // individually keyswitch each lwe
  packing_keyswitch_lwe_list_to_glwe<Torus><<<grid, threads, 0, stream>>>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -37,39 +37,32 @@ __host__ void host_integer_radix_cmux_kb(
    uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
-
-  // Since our CPU threads will be working on different streams we shall assert
-  // the work in the main stream is completed
-  auto true_streams = mem_ptr->zero_if_true_buffer->true_streams;
-  auto false_streams = mem_ptr->zero_if_false_buffer->false_streams;
-  for (uint j = 0; j < gpu_count; j++) {
-    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-  }
-
-  auto mem_true = mem_ptr->zero_if_true_buffer;
-  zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
-                     lwe_array_true, lwe_condition, mem_true,
-                     mem_ptr->inverted_predicate_lut, bsks, ksks,
-                     num_radix_blocks);
-  auto mem_false = mem_ptr->zero_if_false_buffer;
-  zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
-                     mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
-                     mem_false, mem_ptr->predicate_lut, bsks, ksks,
-                     num_radix_blocks);
-  for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
-    cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
-  }
-  for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
-    cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
+  Torus lwe_size = params.big_lwe_dimension + 1;
+  Torus radix_lwe_size = lwe_size * num_radix_blocks;
+  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in, lwe_array_true,
+                               radix_lwe_size * sizeof(Torus), streams[0],
+                               gpu_indexes[0]);
+  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in + radix_lwe_size,
+                               lwe_array_false, radix_lwe_size * sizeof(Torus),
+                               streams[0], gpu_indexes[0]);
+  for (uint i = 0; i < 2 * num_radix_blocks; i++) {
+    cuda_memcpy_async_gpu_to_gpu(mem_ptr->condition_array + i * lwe_size,
+                                 lwe_condition, lwe_size * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
  }
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in,
+      mem_ptr->condition_array, bsks, ksks, 2 * num_radix_blocks,
+      mem_ptr->predicate_lut, params.message_modulus);

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
  // have kept its value
-  auto added_cts = mem_ptr->tmp_true_ct;
-  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
-                       mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
-                       params.big_lwe_dimension, num_radix_blocks);
+  auto mem_true = mem_ptr->buffer_out;
+  auto mem_false = &mem_ptr->buffer_out[radix_lwe_size];
+  auto added_cts = mem_true;
+  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts, mem_true,
+                       mem_false, params.big_lwe_dimension, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -58,6 +58,9 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  case GE:
  case LT:
  case LE:
+    if (num_radix_blocks % 2 != 0)
+      PANIC("Cuda error (comparisons): the number of radix blocks has to be "
+            "even.")
    host_integer_radix_difference_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
@@ -68,6 +71,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    break;
  case MAX:
  case MIN:
+    if (num_radix_blocks % 2 != 0)
+      PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
    host_integer_radix_maxmin_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
@@ -89,3 +94,91 @@ void cleanup_cuda_integer_comparison(void *const *streams,
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
+
+/* Scratch function for the 64 bits "are all comparison blocks true" check.
+ * Builds an int_radix_params from the scalar parameters and forwards to
+ * scratch_cuda_integer_radix_comparison_check_kb with comparison type EQ.
+ * mem_ptr is reinterpreted as an int_comparison_buffer<uint64_t> **.
+ */
+void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);
+
+  scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
+      false, allocate_gpu_memory);
+}
+
+/* 64 bits entry point of the "are all comparison blocks true" check.
+ * Reinterprets mem_ptr as an int_comparison_buffer<uint64_t> and the untyped
+ * pointers as uint64_t data, then forwards to
+ * host_integer_are_all_comparisons_block_true_kb.
+ */
+void cuda_integer_are_all_comparisons_block_true_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
+
+  int_comparison_buffer<uint64_t> *buffer =
+      (int_comparison_buffer<uint64_t> *)mem_ptr;
+
+  host_integer_are_all_comparisons_block_true_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_in), buffer, bsks,
+      (uint64_t **)(ksks), num_radix_blocks);
+}
+
+/* Calls release() on the int_comparison_buffer<uint64_t> stored in
+ * *mem_ptr_void.
+ * NOTE(review): the buffer object itself is not deleted here and
+ * *mem_ptr_void is left untouched - confirm who owns it.
+ */
+void cleanup_cuda_integer_are_all_comparisons_block_true(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
+
+  int_comparison_buffer<uint64_t> *mem_ptr =
+      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
+
+/* Scratch function for the 64 bits "is at least one comparison block true"
+ * check. Builds an int_radix_params from the scalar parameters and forwards
+ * to scratch_cuda_integer_radix_comparison_check_kb with comparison type EQ.
+ * mem_ptr is reinterpreted as an int_comparison_buffer<uint64_t> **.
+ */
+void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    bool allocate_gpu_memory) {
+
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, small_lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);
+
+  scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
+      false, allocate_gpu_memory);
+}
+
+/* 64 bits entry point of the "is at least one comparison block true" check.
+ * Reinterprets mem_ptr as an int_comparison_buffer<uint64_t> and the untyped
+ * pointers as uint64_t data, then forwards to
+ * host_integer_is_at_least_one_comparisons_block_true_kb.
+ */
+void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
+
+  int_comparison_buffer<uint64_t> *buffer =
+      (int_comparison_buffer<uint64_t> *)mem_ptr;
+
+  host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_in), buffer, bsks,
+      (uint64_t **)(ksks), num_radix_blocks);
+}
+
+/* Calls release() on the int_comparison_buffer<uint64_t> stored in
+ * *mem_ptr_void.
+ * NOTE(review): the buffer object itself is not deleted here and
+ * *mem_ptr_void is left untouched - confirm who owns it.
+ */
+void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr_void) {
+
+  int_comparison_buffer<uint64_t> *mem_ptr =
+      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
+  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
+}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -58,7 +58,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
 template <typename Torus>
 __host__ void are_all_comparisons_block_true(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks, uint32_t num_radix_blocks) {

@@ -85,16 +85,19 @@ __host__ void are_all_comparisons_block_true(

  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    uint32_t chunk_length = std::min(max_value, remaining_blocks);
-    int num_chunks = remaining_blocks / chunk_length;
+    int num_chunks = (remaining_blocks + max_value - 1) / max_value;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = tmp_out;
    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
-    auto is_equal_to_num_blocks_map =
-        &are_all_block_true_buffer->is_equal_to_lut_map;
+    auto is_max_value_lut = are_all_block_true_buffer->is_max_value;
+    uint32_t chunk_lengths[num_chunks];
+    auto begin_remaining_blocks = remaining_blocks;
    for (int i = 0; i < num_chunks; i++) {
+      uint32_t chunk_length =
+          std::min(max_value, begin_remaining_blocks - i * max_value);
+      chunk_lengths[i] = chunk_length;
      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
                                   input_blocks, big_lwe_dimension,
                                   chunk_length);
@@ -107,53 +110,50 @@ __host__ void are_all_comparisons_block_true(

    // Selects a LUT
    int_radix_lut<Torus> *lut;
-    auto broadcast_lut_should_be_called = false;
    if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
      // is_non_zero_lut_buffer LUT
      lut = mem_ptr->eq_buffer->is_non_zero_lut;
    } else {
-      if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
-          (*is_equal_to_num_blocks_map).end()) {
-        // The LUT is already computed
-        lut = (*is_equal_to_num_blocks_map)[chunk_length];
-      } else {
+      if (chunk_lengths[num_chunks - 1] != max_value) {
        // LUT needs to be computed
-        auto new_lut =
-            new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
-                                     max_value, num_radix_blocks, true);
-
+        uint32_t chunk_length = chunk_lengths[num_chunks - 1];
        auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
          return x == chunk_length;
        };
        generate_device_accumulator<Torus>(
-            streams[0], gpu_indexes[0], new_lut->get_lut(0, 0), glwe_dimension,
-            polynomial_size, message_modulus, carry_modulus,
+            streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
+            glwe_dimension, polynomial_size, message_modulus, carry_modulus,
            is_equal_to_num_blocks_lut_f);

-        // new_lut->broadcast_lut(streams, gpu_indexes, 0);
-        broadcast_lut_should_be_called = true;
-
-        (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
-        lut = new_lut;
+        Torus *h_lut_indexes = (Torus *)malloc(num_chunks * sizeof(Torus));
+        for (int index = 0; index < num_chunks; index++) {
+          if (index == num_chunks - 1) {
+            h_lut_indexes[index] = 1;
+          } else {
+            h_lut_indexes[index] = 0;
+          }
+        }
+        cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
+                                 h_lut_indexes, num_chunks * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+        is_max_value_lut->broadcast_lut(streams, gpu_indexes, 0);
+        cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+        free(h_lut_indexes);
      }
+      lut = is_max_value_lut;
    }

    // Applies the LUT
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, 1, lwe_array_out, accumulator, bsks, ksks, 1,
-          lut);
+          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
+          ksks, 1, lut);
      return;
    } else {
-      if (broadcast_lut_should_be_called)
-        integer_radix_apply_univariate_lookup_table_kb<Torus>(
-            streams, gpu_indexes, 1, tmp_out, accumulator, bsks, ksks,
-            num_chunks, lut);
-      else
-        integer_radix_apply_univariate_lookup_table_kb<Torus>(
-            streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
-            num_chunks, lut);
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
+          num_chunks, lut);
    }
  }
 }
@@ -167,7 +167,7 @@ __host__ void are_all_comparisons_block_true(
 template <typename Torus>
 __host__ void is_at_least_one_comparisons_block_true(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks, uint32_t num_radix_blocks) {

@@ -189,14 +189,18 @@ __host__ void is_at_least_one_comparisons_block_true(
  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    uint32_t chunk_length = std::min(max_value, remaining_blocks);
-    int num_chunks = remaining_blocks / chunk_length;
+    int num_chunks = (remaining_blocks + max_value - 1) / max_value;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = mem_ptr->tmp_lwe_array_out;
    auto accumulator = buffer->tmp_block_accumulated;
+    uint32_t chunk_lengths[num_chunks];
+    auto begin_remaining_blocks = remaining_blocks;
    for (int i = 0; i < num_chunks; i++) {
+      uint32_t chunk_length =
+          std::min(max_value, begin_remaining_blocks - i * max_value);
+      chunk_lengths[i] = chunk_length;
      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
                                   input_blocks, big_lwe_dimension,
                                   chunk_length);
@@ -458,10 +462,12 @@ __host__ void tree_sign_reduction(
  generate_device_accumulator<Torus>(
      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
      polynomial_size, message_modulus, carry_modulus, f);
+  last_lut->broadcast_lut(streams, gpu_indexes, 0);

  // Last leaf
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, 1, lwe_array_out, y, bsks, ksks, 1, last_lut);
+      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
+      last_lut);
 }

 template <typename Torus>
@@ -486,8 +492,9 @@ __host__ void host_integer_radix_difference_check_kb(
  if (carry_modulus >= message_modulus) {
    // Packing is possible
    // Pack inputs
-    Torus *packed_left = diff_buffer->tmp_packed_left;
-    Torus *packed_right = diff_buffer->tmp_packed_right;
+    Torus *packed_left = diff_buffer->tmp_packed;
+    Torus *packed_right =
+        diff_buffer->tmp_packed + num_radix_blocks / 2 * big_lwe_size;
    // In case the ciphertext is signed, the sign block and the one before it
    // are handled separately
    if (mem_ptr->is_signed) {
@@ -506,10 +513,7 @@ __host__ void host_integer_radix_difference_check_kb(
    auto identity_lut = mem_ptr->identity_lut;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
-        packed_num_radix_blocks, identity_lut);
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
-        packed_num_radix_blocks, identity_lut);
+        2 * packed_num_radix_blocks, identity_lut);

    lhs = packed_left;
    rhs = packed_right;
@@ -538,11 +542,13 @@ __host__ void host_integer_radix_difference_check_kb(

      // Compare the last block before the sign block separately
      auto identity_lut = mem_ptr->identity_lut;
+      Torus *packed_left = diff_buffer->tmp_packed;
+      Torus *packed_right =
+          diff_buffer->tmp_packed + num_radix_blocks / 2 * big_lwe_size;
      Torus *last_left_block_before_sign_block =
-          diff_buffer->tmp_packed_left + packed_num_radix_blocks * big_lwe_size;
+          packed_left + packed_num_radix_blocks * big_lwe_size;
      Torus *last_right_block_before_sign_block =
-          diff_buffer->tmp_packed_right +
-          packed_num_radix_blocks * big_lwe_size;
+          packed_right + packed_num_radix_blocks * big_lwe_size;
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
@@ -620,4 +626,35 @@ __host__ void host_integer_radix_maxmin_kb(
      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
 }

+// Host-side wrapper that forwards every argument unchanged to
+// are_all_comparisons_block_true<Torus>. The result written to lwe_array_out
+// is a block encrypting 1 if all input blocks are 1, otherwise the block
+// encrypts 0. num_radix_blocks is the number of blocks in lwe_array_in.
+template <typename Torus>
+__host__ void host_integer_are_all_comparisons_block_true_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
+    Torus *const *ksks, uint32_t num_radix_blocks) {
+
+  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
+                                        lwe_array_out, lwe_array_in, mem_ptr,
+                                        bsks, ksks, num_radix_blocks);
+}
+
+// Host-side wrapper that forwards every argument unchanged to
+// is_at_least_one_comparisons_block_true<Torus>. Per the callee's name, the
+// block written to lwe_array_out encrypts 1 when at least one input block
+// encrypts 1, and 0 otherwise. num_radix_blocks counts the input blocks.
+template <typename Torus>
+__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
+    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
+    Torus *const *ksks, uint32_t num_radix_blocks) {
+
+  is_at_least_one_comparisons_block_true<Torus>(
+      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
+      bsks, ksks, num_radix_blocks);
+}
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -2,6 +2,7 @@
 #define CUDA_INTEGER_COMPRESSION_CUH

 #include "ciphertext.h"
+#include "crypto/fast_packing_keyswitch.cuh"
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer/compression/compression.h"
@@ -116,11 +117,21 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
  while (rem_lwes > 0) {
    auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);

-    host_packing_keyswitch_lwe_list_to_glwe<Torus>(
-        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
-        fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
-        compression_params.polynomial_size, compression_params.ks_base_log,
-        compression_params.ks_level, chunk_size);
+    if (can_use_pks_fast_path(
+            input_lwe_dimension, chunk_size, compression_params.polynomial_size,
+            compression_params.ks_level, compression_params.glwe_dimension)) {
+      host_fast_packing_keyswitch_lwe_list_to_glwe<Torus, ulonglong4>(
+          streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
+          fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
+          compression_params.polynomial_size, compression_params.ks_base_log,
+          compression_params.ks_level, chunk_size);
+    } else {
+      host_packing_keyswitch_lwe_list_to_glwe<Torus>(
+          streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
+          fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
+          compression_params.polynomial_size, compression_params.ks_base_log,
+          compression_params.ks_level, chunk_size);
+    }

    rem_lwes -= chunk_size;
    lwe_subset += chunk_size * lwe_in_size;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -286,7 +286,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      uint32_t shifted_mask = full_message_mask >> shift_amount;

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, 1, interesting_divisor.last_block(),
+          streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
          interesting_divisor.last_block(), bsks, ksks, 1,
          mem_ptr->masking_luts_1[shifted_mask]);
    }; // trim_last_interesting_divisor_bits
@@ -315,7 +315,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      shifted_mask = shifted_mask & full_message_mask;

      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          streams, gpu_indexes, 1, divisor_ms_blocks.first_block(),
+          streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
          divisor_ms_blocks.first_block(), bsks, ksks, 1,
          mem_ptr->masking_luts_2[shifted_mask]);
    }; // trim_first_divisor_ms_bits
@@ -340,7 +340,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
                                    streams[0], gpu_indexes[0]);

      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-          streams, gpu_indexes, 1, interesting_remainder1.data, 1,
+          streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
          mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);

      tmp_radix.clone_from(interesting_remainder1, 0,
@@ -370,13 +370,13 @@ __host__ void host_unsigned_integer_div_rem_kb(
                                                 uint32_t const *gpu_indexes,
                                                 uint32_t gpu_count) {
      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
-          streams, gpu_indexes, 1, interesting_remainder2.data, 1,
+          streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
          mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
    }; // left_shift_interesting_remainder2

-    //for (uint j = 0; j < gpu_count; j++) {
-      cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-    //}
+    for (uint j = 0; j < gpu_count; j++) {
+      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+    }
    // interesting_divisor
    trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
                                       gpu_count);
@@ -389,12 +389,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
    // interesting_remainder2
    left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
                                      gpu_count);
-//    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
-      cuda_synchronize_stream(mem_ptr->sub_streams_1[0], gpu_indexes[0]);
-      cuda_synchronize_stream(mem_ptr->sub_streams_2[0], gpu_indexes[0]);
-      cuda_synchronize_stream(mem_ptr->sub_streams_3[0], gpu_indexes[0]);
-      cuda_synchronize_stream(mem_ptr->sub_streams_4[0], gpu_indexes[0]);
-//    }
+    for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
+      cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
+      cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
+    }

    // if interesting_remainder1 != 0 -> interesting_remainder2 == 0
    // if interesting_remainder1 == 0 -> interesting_remainder2 != 0
@@ -438,7 +438,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
          streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
          merged_interesting_remainder.len);
      host_integer_overflowing_sub<uint64_t>(
-          streams, gpu_indexes, 1, new_remainder.data,
+          streams, gpu_indexes, gpu_count, new_remainder.data,
          (uint64_t *)merged_interesting_remainder.data,
          interesting_divisor.data, subtraction_overflowed.data,
          (const Torus *)nullptr, mem_ptr->overflow_sub_mem, bsks, ksks,
@@ -460,7 +460,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
        // But we are in the special case where scalar == 0
        // So we can skip some stuff
        host_compare_with_zero_equality<Torus>(
-            streams, gpu_indexes, 1, tmp_1.data, trivial_blocks.data,
+            streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
            mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
            mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);

@@ -468,7 +468,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
            ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);

        is_at_least_one_comparisons_block_true<Torus>(
-            streams, gpu_indexes, 1,
+            streams, gpu_indexes, gpu_count,
            at_least_one_upper_block_is_non_zero.data, tmp_1.data,
            mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
      }
@@ -482,7 +482,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
            uint32_t gpu_count) {
          integer_radix_apply_univariate_lookup_table_kb<Torus>(
-              streams, gpu_indexes, 1,
+              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data, bsks, ksks,
              cleaned_merged_interesting_remainder.len,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -198,6 +198,27 @@ void scratch_cuda_apply_univariate_lut_kb_64(
      allocate_gpu_memory);
 }

+// Type-erased 64-bit wrapper: builds the int_radix_params describing the
+// parameter set and forwards to
+// scratch_cuda_apply_many_univariate_lut_kb<uint64_t>, which stores the newly
+// created int_radix_lut<uint64_t> in `*mem_ptr`.
+void scratch_cuda_apply_many_univariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
+    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t grouping_factor, uint32_t num_radix_blocks,
+    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
+    uint32_t num_many_lut, bool allocate_gpu_memory) {
+
+  // The big LWE dimension is derived from the GLWE parameters; the
+  // `lwe_dimension` argument is passed as the small LWE dimension.
+  const uint32_t big_lwe_dimension = glwe_dimension * polynomial_size;
+  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
+                          big_lwe_dimension, lwe_dimension, ks_level,
+                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
+                          message_modulus, carry_modulus);
+
+  // Recover the typed views of the opaque arguments.
+  auto *typed_streams = (cudaStream_t *)(streams);
+  auto **lut_mem = (int_radix_lut<uint64_t> **)mem_ptr;
+  auto *typed_input_lut = static_cast<const uint64_t *>(input_lut);
+
+  scratch_cuda_apply_many_univariate_lut_kb<uint64_t>(
+      typed_streams, gpu_indexes, gpu_count, lut_mem, typed_input_lut,
+      num_radix_blocks, params, num_many_lut, allocate_gpu_memory);
+}
+
 void cuda_apply_univariate_lut_kb_64(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count, void *output_radix_lwe,
@@ -237,7 +258,7 @@ void cuda_apply_many_univariate_lut_kb_64(

 void scratch_cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void *input_lut, uint32_t lwe_dimension,
+    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
@@ -251,8 +272,9 @@ void scratch_cuda_apply_bivariate_lut_kb_64(

  scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
-      num_radix_blocks, params, allocate_gpu_memory);
+      (int_radix_lut<uint64_t> **)mem_ptr,
+      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
+      allocate_gpu_memory);
 }

 void cuda_apply_bivariate_lut_kb_64(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1557,6 +1557,25 @@ void host_apply_univariate_lut_kb(cudaStream_t const *streams,
      num_blocks, mem);
 }

+// Allocates an int_radix_lut<Torus> for `num_radix_blocks` blocks and
+// `num_many_lut` functions, stores it in `*mem_ptr`, uploads `input_lut` into
+// its first LUT slot and broadcasts the LUT.
+template <typename Torus>
+void scratch_cuda_apply_many_univariate_lut_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
+    uint32_t num_radix_blocks, int_radix_params params, uint32_t num_many_lut,
+    bool allocate_gpu_memory) {
+
+  auto *lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
+                                       1, num_radix_blocks, num_many_lut,
+                                       allocate_gpu_memory);
+  *mem_ptr = lut;
+
+  // Number of bytes copied from `input_lut`:
+  // (glwe_dimension + 1) * polynomial_size torus elements.
+  const auto lut_size_bytes =
+      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
+
+  // Doing this copy on GPU 0 is safe, since all LUTs always reside on GPU 0
+  cuda_memcpy_async_to_gpu(lut->get_lut(0, 0), (void *)input_lut,
+                           lut_size_bytes, streams[0], gpu_indexes[0]);
+  lut->broadcast_lut(streams, gpu_indexes, 0);
+}
+
 template <typename Torus>
 void host_apply_many_univariate_lut_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1624,13 +1643,12 @@ void host_propagate_single_carry(cudaStream_t const *streams,
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  auto big_lwe_dimension = big_lwe_size - 1; // For host addition
  auto lut_stride = mem->lut_stride;
  auto num_many_lut = mem->num_many_lut;
+  auto output_flag = mem->output_flag + big_lwe_size * num_radix_blocks;
  if (requested_flag == outputFlag::FLAG_OVERFLOW)
    PANIC("Cuda error: single carry propagation is not supported for overflow, "
          "try using add_and_propagate_single_carry");
@@ -1647,7 +1665,7 @@ void host_propagate_single_carry(cudaStream_t const *streams,

  if (requested_flag == outputFlag::FLAG_CARRY) {
    cuda_memcpy_async_gpu_to_gpu(
-        mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
+        output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
  }
  // Step 2
@@ -1667,45 +1685,40 @@ void host_propagate_single_carry(cudaStream_t const *streams,

  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], mem->output_flag,
-                         mem->output_flag,
+    host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
                         mem->prop_simu_group_carries_mem->simulators +
                             (num_radix_blocks - 1) * big_lwe_size,
                         big_lwe_dimension, 1);
  }

-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-
-  // Step 3
-  //  Add carries and cleanup OutputFlag::None
  host_radix_sum_in_groups<Torus>(
-      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+      streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
      mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks,
      big_lwe_size, group_size);
-
-  auto message_extract = mem->lut_message_extract;
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
-      bsks, ksks, num_radix_blocks, message_extract);
-
  if (requested_flag == outputFlag::FLAG_CARRY) {
-    host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
-                         mem->output_flag, mem->output_flag,
+    host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
                         mem->prop_simu_group_carries_mem->resolved_carries +
                             (mem->num_groups - 1) * big_lwe_size,
                         big_lwe_dimension, 1);

+    cuda_memcpy_async_gpu_to_gpu(
+        prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
-        mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
+        streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
+        bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);

-    cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
-                                 big_lwe_size_bytes, mem->sub_streams_2[0],
-                                 gpu_indexes[0]);
-  }
-  for (int j = 0; j < mem->active_gpu_count; j++) {
-    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
-    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
+    cuda_memcpy_async_gpu_to_gpu(lwe_array, mem->output_flag,
+                                 big_lwe_size_bytes * num_radix_blocks,
+                                 streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(
+        carry_out, mem->output_flag + num_radix_blocks * big_lwe_size,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+  } else {
+    auto message_extract = mem->lut_message_extract;
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks, ksks,
+        num_radix_blocks, message_extract);
  }
 }

@@ -1721,13 +1734,12 @@ void host_add_and_propagate_single_carry(
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
-  auto message_modulus = params.message_modulus;
-  auto carry_modulus = params.carry_modulus;
  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  auto big_lwe_dimension = big_lwe_size - 1; // For host addition
  auto lut_stride = mem->lut_stride;
  auto num_many_lut = mem->num_many_lut;
+  auto output_flag = mem->output_flag + big_lwe_size * num_radix_blocks;

  if (requested_flag == outputFlag::FLAG_OVERFLOW) {
    cuda_memcpy_async_gpu_to_gpu(
@@ -1754,12 +1766,12 @@ void host_add_and_propagate_single_carry(
  if (requested_flag == outputFlag::FLAG_OVERFLOW) {
    auto lut_overflow_prep = mem->lut_overflow_flag_prep;
    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, mem->output_flag, mem->last_lhs,
+        streams, gpu_indexes, gpu_count, output_flag, mem->last_lhs,
        mem->last_rhs, bsks, ksks, 1, lut_overflow_prep,
        lut_overflow_prep->params.message_modulus);
  } else if (requested_flag == outputFlag::FLAG_CARRY) {
    cuda_memcpy_async_gpu_to_gpu(
-        mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
+        output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
  }

@@ -1780,58 +1792,50 @@ void host_add_and_propagate_single_carry(

  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], mem->output_flag,
-                         mem->output_flag,
+    host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
                         mem->prop_simu_group_carries_mem->simulators +
                             (num_radix_blocks - 1) * big_lwe_size,
                         big_lwe_dimension, 1);
  }

-  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
  // Step 3
  //  Add carries and cleanup OutputFlag::None
  host_radix_sum_in_groups<Torus>(
-      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+      streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
      mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks,
      big_lwe_size, group_size);

-  auto message_extract = mem->lut_message_extract;
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      mem->sub_streams_1, gpu_indexes, gpu_count, lhs_array, prepared_blocks,
-      bsks, ksks, num_radix_blocks, message_extract);
-
  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
    if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW &&
        uses_carry == 1) {
-      host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
-                           mem->output_flag, mem->output_flag, input_carries,
-                           big_lwe_dimension, 1);
+      host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
+                           input_carries, big_lwe_dimension, 1);

    } else {

-      host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
-                           mem->output_flag, mem->output_flag,
+      host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
                           mem->prop_simu_group_carries_mem->resolved_carries +
                               (mem->num_groups - 1) * big_lwe_size,
                           big_lwe_dimension, 1);
    }
-    if (requested_flag == outputFlag::FLAG_OVERFLOW) {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
-          mem->output_flag, bsks, ksks, 1, mem->lut_overflow_flag_last);
-    } else {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
-          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
-          mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
-    }
-    cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
-                                 big_lwe_size_bytes, mem->sub_streams_2[0],
-                                 gpu_indexes[0]);
-  }
-  for (int j = 0; j < mem->active_gpu_count; j++) {
-    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
-    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
+    cuda_memcpy_async_gpu_to_gpu(
+        prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
+        bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);
+
+    cuda_memcpy_async_gpu_to_gpu(lhs_array, mem->output_flag,
+                                 big_lwe_size_bytes * num_radix_blocks,
+                                 streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(
+        carry_out, mem->output_flag + num_radix_blocks * big_lwe_size,
+        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+  } else {
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks, ksks,
+        num_radix_blocks, mem->lut_message_extract);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
@@ -22,6 +22,9 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
  case GE:
  case LT:
  case LE:
+    if (lwe_ciphertext_count % 2 != 0)
+      PANIC("Cuda error (scalar comparisons): the number of radix blocks has "
+            "to be even.")
    host_integer_radix_scalar_difference_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
@@ -32,6 +35,9 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
    break;
  case MAX:
  case MIN:
+    if (lwe_ciphertext_count % 2 != 0)
+      PANIC("Cuda error (scalar max/min): the number of radix blocks has to be "
+            "even.")
    host_integer_radix_scalar_maxmin_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -141,8 +141,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    //////////////
    // lsb
-    Torus *lhs = diff_buffer->tmp_packed_left;
-    Torus *rhs = diff_buffer->tmp_packed_right;
+    Torus *lhs = diff_buffer->tmp_packed;
+    Torus *rhs =
+        diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size;

    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
                       big_lwe_dimension, num_lsb_radix_blocks,
@@ -210,8 +211,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    uint32_t num_lsb_radix_blocks = total_num_radix_blocks;
    uint32_t num_scalar_blocks = total_num_scalar_blocks;

-    Torus *lhs = diff_buffer->tmp_packed_left;
-    Torus *rhs = diff_buffer->tmp_packed_right;
+    Torus *lhs = diff_buffer->tmp_packed;
+    Torus *rhs =
+        diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size;

    pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
                       big_lwe_dimension, num_lsb_radix_blocks,
@@ -358,8 +360,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    //////////////
    // lsb
-    Torus *lhs = diff_buffer->tmp_packed_left;
-    Torus *rhs = diff_buffer->tmp_packed_right;
+    Torus *lhs = diff_buffer->tmp_packed;
+    Torus *rhs =
+        diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size;

    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
                       big_lwe_dimension, num_lsb_radix_blocks,
@@ -458,8 +461,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_sign_out =
        lwe_array_ct_out + (num_lsb_radix_blocks / 2) * big_lwe_size;
-    Torus *lhs = diff_buffer->tmp_packed_left;
-    Torus *rhs = diff_buffer->tmp_packed_right;
+    Torus *lhs = diff_buffer->tmp_packed;
+    Torus *rhs =
+        diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size;

    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
                       big_lwe_dimension, num_lsb_radix_blocks - 1,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -36,7 +36,7 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(

  *mem_ptr =
      new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
-                                   num_radix_blocks, allocate_gpu_memory);
+                                   num_radix_blocks, allocate_gpu_memory, true);
 }

 template <typename T, class params>
@@ -94,9 +94,11 @@ __host__ void host_integer_scalar_mul_radix(
  }
  cuda_synchronize_stream(streams[0], gpu_indexes[0]);

-  cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
-  mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
-  delete (mem->logical_scalar_shift_buffer);
+  if (mem->anticipated_buffers_drop) {
+    cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+    mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
+    delete (mem->logical_scalar_shift_buffer);
+  }

  if (j == 0) {
    // lwe array = 0
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -136,7 +136,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_programmable_bootstrap_tbc<Torus, Degree<512>>(
+    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -144,7 +144,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
+    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -393,7 +393,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_programmable_bootstrap_cg<Torus, Degree<512>>(
+    host_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -401,7 +401,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_programmable_bootstrap_cg<Torus, Degree<1024>>(
+    host_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -468,7 +468,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_programmable_bootstrap<Torus, Degree<512>>(
+    host_programmable_bootstrap<Torus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -476,7 +476,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_programmable_bootstrap<Torus, Degree<1024>>(
+    host_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -480,20 +480,35 @@ __host__ void host_programmable_bootstrap(
  double2 *global_join_buffer = pbs_buffer->global_join_buffer;
  int8_t *d_mem = pbs_buffer->d_mem;

+  // Record the whole loop over lwe_dimension into one CUDA graph and launch it
+  // once. Kernel arguments are frozen when a launch is captured, so every
+  // iteration must be captured with its own index i; replaying a graph
+  // captured for a single iteration would repeat that iteration's work.
+  // NOTE(review): CUDA return codes are not checked here.
+  cudaGraph_t graph;
+  cudaGraphExec_t instance;
+  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  for (int i = 0; i < lwe_dimension; i++) {
    execute_step_one<Torus, params>(
        stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, global_accumulator,
        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
        partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
    execute_step_two<Torus, params>(
        stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, bootstrapping_key, global_accumulator,
        global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
        num_many_lut, lut_stride);
  }
+  cudaStreamEndCapture(stream, &graph);
+  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
+  cudaGraphLaunch(instance, stream);
+  // Drop the per-call graph handles so they do not leak. TODO confirm for the
+  // targeted toolkit: an in-flight executable graph is freed on completion.
+  cudaGraphExecDestroy(instance);
+  cudaGraphDestroy(graph);
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -649,29 +649,45 @@ __host__ void host_multi_bit_programmable_bootstrap(

  auto lwe_chunk_size = buffer->lwe_chunk_size;

+  // Record every chunk of the loop into one CUDA graph and launch it once.
+  // Kernel arguments are frozen when a launch is captured, so each chunk must
+  // be captured with its own lwe_offset and chunk_size; replaying a graph
+  // captured for the first chunk would repeat that chunk's work.
+  // NOTE(review): CUDA return codes are not checked here.
+  cudaGraph_t graph;
+  cudaGraphExec_t instance;
+  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
+
  for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
       lwe_offset += lwe_chunk_size) {

    // Compute a keybundle
    execute_compute_keybundle<Torus, params>(
        stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, level_count, lwe_offset);
    // Accumulate
    uint32_t chunk_size = std::min(
        lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
    for (uint32_t j = 0; j < chunk_size; j++) {
      execute_step_one<Torus, params>(
          stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
          lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, j, lwe_offset);

      execute_step_two<Torus, params>(
          stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
          num_samples, lwe_dimension, glwe_dimension, polynomial_size,
          grouping_factor, level_count, j, lwe_offset, num_many_lut,
          lut_stride);
    }
  }
+  cudaStreamEndCapture(stream, &graph);
+  cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
+  cudaGraphLaunch(instance, stream);
+  // Drop the per-call graph handles so they do not leak. TODO confirm for the
+  // targeted toolkit: an in-flight executable graph is freed on completion.
+  cudaGraphExecDestroy(instance);
+  cudaGraphDestroy(graph);
 }
 #endif // MULTIBIT_PBS_H
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -163,6 +163,29 @@ extern "C" {
        allocate_gpu_memory: bool,
    );
 }
+extern "C" {
+    pub fn scratch_cuda_apply_many_univariate_lut_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        input_lut: *const ffi::c_void,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_radix_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        num_many_lut: u32,
+        allocate_gpu_memory: bool,
+    );
+}
 extern "C" {
    pub fn cuda_apply_univariate_lut_kb_64(
        streams: *const *mut ffi::c_void,
@@ -1083,6 +1106,92 @@ extern "C" {
        mem_ptr_void: *mut *mut i8,
    );
 }
+extern "C" {
+    pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_radix_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_are_all_comparisons_block_true_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_radix_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_are_all_comparisons_block_true(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
+extern "C" {
+    pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_radix_blocks: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: PBS_TYPE,
+        allocate_gpu_memory: bool,
+    );
+}
+extern "C" {
+    pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut ffi::c_void,
+        lwe_array_in: *const ffi::c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut ffi::c_void,
+        ksks: *const *mut ffi::c_void,
+        num_radix_blocks: u32,
+    );
+}
+extern "C" {
+    pub fn cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
+        streams: *const *mut ffi::c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr_void: *mut *mut i8,
+    );
+}
 extern "C" {
    pub fn cuda_keyswitch_lwe_ciphertext_vector_32(
        stream: *mut ffi::c_void,
@@ -1120,6 +1229,7 @@ extern "C" {
        stream: *mut ffi::c_void,
        gpu_index: u32,
        fp_ks_buffer: *mut *mut i8,
+        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        num_lwes: u32,
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -31,7 +31,7 @@ instance_type = "m6i.4xlarge"
 [backend.hyperstack.gpu-test]
 environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
-flavor_name = "n3-RTX-A6000x1"
+flavor_name = "n3-L40x1"

 [backend.hyperstack.single-h100]
 environment_name = "canada"
@@ -58,6 +58,12 @@ environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
 flavor_name = "n3-H100x8-NVLink"

+
+[backend.hyperstack.multi-h100-sxm5]
+environment_name = "canada"
+image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
+flavor_name = "n3-H100-SXM5x8"
+
 [backend.hyperstack.multi-a100-nvlink]
 environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
--- a/scripts/get_arch_feature.sh
+++ b/scripts/get_arch_feature.sh
@@ -1,19 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-ARCH_FEATURE=x86_64
-
-IS_AARCH64="$( (uname -a | grep -c "arm64\|aarch64") || true)"
-
-if [[ "${IS_AARCH64}" != "0" ]]; then
-    ARCH_FEATURE=aarch64
-fi
-
-UNAME="$(uname)"
-
-if [[ "${UNAME}" == "Linux" || "${UNAME}" == "Darwin" ]]; then
-    ARCH_FEATURE="${ARCH_FEATURE}-unix"
-fi
-
-echo "${ARCH_FEATURE}"
--- a/scripts/integer-tests.sh
+++ b/scripts/integer-tests.sh
@@ -10,6 +10,9 @@ function usage() {
    echo "--multi-bit               Run multi-bit tests only: default off"
    echo "--unsigned-only           Run only unsigned integer tests, by default both signed and unsigned tests are run"
    echo "--signed-only             Run only signed integer tests, by default both signed and unsigned tests are run"
+    echo "--nightly-tests           Run integer tests configured for nightly runs (3_3 params)"
+    echo "--fast-tests              Run integer set but skip a subset of longer tests"
+    echo "--long-tests              Run only long run integer tests"
    echo "--cargo-profile           The cargo profile used to build tests"
    echo "--backend                 Backend to use with tfhe-rs"
    echo "--avx512-support          Set to ON to enable avx512"
@@ -21,6 +24,7 @@ RUST_TOOLCHAIN="+stable"
 multi_bit_argument=
 sign_argument=
 fast_tests_argument=
+long_tests_argument=
 nightly_tests_argument=
 no_big_params_argument=
 cargo_profile="release"
@@ -91,6 +95,10 @@ if [[ "${FAST_TESTS}" == TRUE ]]; then
    fast_tests_argument=--fast-tests
 fi

+if [[ "${LONG_TESTS}" == TRUE ]]; then
+    long_tests_argument=--long-tests
+fi
+
 if [[ "${NIGHTLY_TESTS}" == TRUE ]]; then
    nightly_tests_argument=--nightly-tests
 fi
@@ -104,7 +112,6 @@ if [[ "${backend}" == "gpu" ]]; then
 fi

 CURR_DIR="$(dirname "$0")"
-ARCH_FEATURE="$("${CURR_DIR}/get_arch_feature.sh")"

 # TODO autodetect/have a finer CPU count depending on memory
 num_cpu_threads="$("${CURR_DIR}"/cpu_count.sh)"
@@ -138,32 +145,38 @@ if [[ "${backend}" == "gpu" ]]; then
    fi
 fi

-filter_expression=$(/usr/bin/python3 scripts/test_filtering.py --layer integer --backend "${backend}" ${fast_tests_argument} ${nightly_tests_argument} ${multi_bit_argument} ${sign_argument} ${no_big_params_argument})
+filter_expression=$(/usr/bin/python3 scripts/test_filtering.py --layer integer --backend "${backend}" ${fast_tests_argument} ${long_tests_argument} ${nightly_tests_argument} ${multi_bit_argument} ${sign_argument} ${no_big_params_argument})

 if [[ "${FAST_TESTS}" == "TRUE" ]]; then
    echo "Running 'fast' test set"
-else
+elif [[ "${LONG_TESTS}" != "TRUE" ]]; then
    echo "Running 'slow' test set"
 fi

+if [[ "${LONG_TESTS}" == "TRUE" ]]; then
+    echo "Running 'long run' test set"
+fi
+
 if [[ "${NIGHTLY_TESTS}" == "TRUE" ]]; then
    echo "Running 'nightly' test set"
 fi

+echo "${filter_expression}"
+
 cargo "${RUST_TOOLCHAIN}" nextest run \
    --tests \
    --cargo-profile "${cargo_profile}" \
    --package "${tfhe_package}" \
    --profile ci \
-    --features="${ARCH_FEATURE}",integer,internal-keycache,zk-pok,experimental,"${avx512_feature}","${gpu_feature}" \
+    --features=integer,internal-keycache,zk-pok,experimental,"${avx512_feature}","${gpu_feature}" \
    --test-threads "${test_threads}" \
    -E "$filter_expression"

-if [[ -z ${multi_bit_argument} ]]; then
+if [[ -z ${multi_bit_argument} && -z ${long_tests_argument} ]]; then
    cargo "${RUST_TOOLCHAIN}" test \
        --profile "${cargo_profile}" \
        --package "${tfhe_package}" \
-        --features="${ARCH_FEATURE}",integer,internal-keycache,experimental,"${avx512_feature}","${gpu_feature}" \
+        --features=integer,internal-keycache,experimental,"${avx512_feature}","${gpu_feature}" \
        --doc \
        -- --test-threads="${doctest_threads}" integer::"${gpu_feature}"
 fi
--- a/scripts/shortint-tests.sh
+++ b/scripts/shortint-tests.sh
@@ -65,7 +65,6 @@ if [[ "${FAST_TESTS}" == TRUE ]]; then
 fi

 CURR_DIR="$(dirname "$0")"
-ARCH_FEATURE="$("${CURR_DIR}/get_arch_feature.sh")"

 n_threads_small="$("${CURR_DIR}"/cpu_count.sh)"
 n_threads_big="${n_threads_small}"
@@ -94,7 +93,7 @@ if [[ "${BIG_TESTS_INSTANCE}" != TRUE ]]; then
        --cargo-profile "${cargo_profile}" \
        --package "${tfhe_package}" \
        --profile ci \
-        --features="${ARCH_FEATURE}",shortint,internal-keycache,zk-pok,experimental \
+        --features=shortint,internal-keycache,zk-pok,experimental \
        --test-threads "${n_threads_small}" \
        -E "${filter_expression_small_params}"

@@ -111,7 +110,7 @@ and not test(~smart_add_and_mul)"""
        --cargo-profile "${cargo_profile}" \
        --package "${tfhe_package}" \
        --profile ci \
-        --features="${ARCH_FEATURE}",shortint,internal-keycache,zk-pok,experimental \
+        --features=shortint,internal-keycache,zk-pok,experimental \
        --test-threads "${n_threads_big}" \
        --no-tests=warn \
        -E "${filter_expression_big_params}"
@@ -120,7 +119,7 @@ and not test(~smart_add_and_mul)"""
            cargo "${RUST_TOOLCHAIN}" test \
                --profile "${cargo_profile}" \
                --package "${tfhe_package}" \
-                --features="${ARCH_FEATURE}",shortint,internal-keycache,zk-pok,experimental \
+                --features=shortint,internal-keycache,zk-pok,experimental \
                --doc \
                -- shortint::
        fi
@@ -134,7 +133,7 @@ else
        --cargo-profile "${cargo_profile}" \
        --package "${tfhe_package}" \
        --profile ci \
-        --features="${ARCH_FEATURE}",shortint,internal-keycache,experimental \
+        --features=shortint,internal-keycache,experimental \
        --test-threads "${n_threads_big}" \
        -E "${filter_expression}"

@@ -142,7 +141,7 @@ else
        cargo "${RUST_TOOLCHAIN}" test \
            --profile "${cargo_profile}" \
            --package "${tfhe_package}" \
-            --features="${ARCH_FEATURE}",shortint,internal-keycache,experimental \
+            --features=shortint,internal-keycache,experimental \
            --doc \
            -- --test-threads="${n_threads_big}" shortint::
    fi
--- a/scripts/test_filtering.py
+++ b/scripts/test_filtering.py
@@ -26,6 +26,12 @@ parser.add_argument(
    action="store_true",
    help="Run only a small subset of test suite",
 )
+parser.add_argument(
+    "--long-tests",
+    dest="long_tests",
+    action="store_true",
+    help="Run only the long tests suite",
+)
 parser.add_argument(
    "--nightly-tests",
    dest="nightly_tests",
@@ -80,6 +86,7 @@ EXCLUDED_INTEGER_TESTS = [
    "/.*test_wopbs_bivariate_crt_wopbs_param_message_[34]_carry_[34]_ks_pbs_gaussian_2m64$/",
    "/.*test_integer_smart_mul_param_message_4_carry_4_ks_pbs_gaussian_2m64$/",
    "/.*test_integer_default_add_sequence_multi_thread_param_message_4_carry_4_ks_pbs_gaussian_2m64$/",
+    "/.*::tests_long_run::.*/",
 ]

 # skip default_div, default_rem which are covered by default_div_rem
@@ -94,55 +101,61 @@ EXCLUDED_BIG_PARAMETERS = [
    "/.*_param_message_4_carry_4_ks_pbs_gaussian_2m64$/",
 ]

-
 def filter_integer_tests(input_args):
    (multi_bit_filter, group_filter) = (
        ("_multi_bit", "_group_[0-9]") if input_args.multi_bit else ("", "")
    )
    backend_filter = ""
-    if input_args.backend == "gpu":
-        backend_filter = "gpu::"
-        if multi_bit_filter:
-            # For now, GPU only has specific parameters set for multi-bit
-            multi_bit_filter = "_gpu_multi_bit"
+    if not input_args.long_tests:
+        if input_args.backend == "gpu":
+            backend_filter = "gpu::"
+            if multi_bit_filter:
+                # For now, GPU only has specific parameters set for multi-bit
+                multi_bit_filter = "_gpu_multi_bit"

-    filter_expression = [f"test(/^integer::{backend_filter}.*/)"]
+        filter_expression = [f"test(/^integer::{backend_filter}.*/)"]

-    if input_args.multi_bit:
-        filter_expression.append("test(~_multi_bit)")
-    else:
-        filter_expression.append("not test(~_multi_bit)")
+        if input_args.multi_bit:
+            filter_expression.append("test(~_multi_bit)")
+        else:
+            filter_expression.append("not test(~_multi_bit)")

-    if input_args.signed_only:
-        filter_expression.append("test(~_signed)")
-    if input_args.unsigned_only:
-        filter_expression.append("not test(~_signed)")
+        if input_args.signed_only:
+            filter_expression.append("test(~_signed)")
+        if input_args.unsigned_only:
+            filter_expression.append("not test(~_signed)")

-    if input_args.no_big_params:
-        for pattern in EXCLUDED_BIG_PARAMETERS:
+        if input_args.no_big_params:
+            for pattern in EXCLUDED_BIG_PARAMETERS:
+                filter_expression.append(f"not test({pattern})")
+
+        if input_args.fast_tests and input_args.nightly_tests:
+            filter_expression.append(
+                f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_[2-3]_carry_[2-3]_.*/)"
+            )
+        elif input_args.fast_tests:
+            # Test only fast default operations with only one set of parameters
+            filter_expression.append(
+                f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_2_carry_2_.*/)"
+            )
+        elif input_args.nightly_tests:
+            # Test only fast default operations with only one set of parameters
+            # This subset would run slower than fast_tests hence the use of nightly_tests
+            filter_expression.append(
+                f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_3_carry_3_.*/)"
+            )
+        excluded_tests = (
+            EXCLUDED_INTEGER_FAST_TESTS if input_args.fast_tests else EXCLUDED_INTEGER_TESTS
+        )
+        for pattern in excluded_tests:
            filter_expression.append(f"not test({pattern})")

-    if input_args.fast_tests and input_args.nightly_tests:
-        filter_expression.append(
-            f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_[2-3]_carry_[2-3]_.*/)"
-        )
-    elif input_args.fast_tests:
-        # Test only fast default operations with only one set of parameters
-        filter_expression.append(
-            f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_2_carry_2_.*/)"
-        )
-    elif input_args.nightly_tests:
-        # Test only fast default operations with only one set of parameters
-        # This subset would run slower than fast_tests hence the use of nightly_tests
-        filter_expression.append(
-            f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_3_carry_3_.*/)"
-        )
+    else:
+        if input_args.backend == "gpu":
+            filter_expression = ["test(/^integer::gpu::server_key::radix::tests_long_run.*/)"]
+        else:  # every non-GPU backend runs the CPU long-run suite, so filter_expression is always bound
+            filter_expression = ["test(/^integer::server_key::radix_parallel::tests_long_run.*/)"]

-    excluded_tests = (
-        EXCLUDED_INTEGER_FAST_TESTS if input_args.fast_tests else EXCLUDED_INTEGER_TESTS
-    )
-    for pattern in excluded_tests:
-        filter_expression.append(f"not test({pattern})")

    return " and ".join(filter_expression)

--- a/tasks/Cargo.toml
+++ b/tasks/Cargo.toml
@@ -7,7 +7,6 @@ edition = "2021"

 [dependencies]
 clap = "=4.4.4"
-lazy_static = "1.4"
 log = "0.4"
 simplelog = "0.12"
 walkdir = "2.5.0"
--- a/tasks/src/check_tfhe_docs_are_tested.rs
+++ b/tasks/src/check_tfhe_docs_are_tested.rs
@@ -101,7 +101,7 @@ pub fn check_tfhe_docs_are_tested() -> Result<(), Error> {
        .into_iter()
        .filter_map(|entry| {
            let path = entry.path().canonicalize().ok()?;
-            if path.is_file() && path.extension().map_or(false, |e| e == "md") {
+            if path.is_file() && path.extension().is_some_and(|e| e == "md") {
                let file_content = std::fs::read_to_string(&path).ok()?;
                if file_content.contains("```rust") {
                    Some(path.to_path_buf())
--- a/tasks/src/main.rs
+++ b/tasks/src/main.rs
@@ -1,5 +1,4 @@
 use clap::{Arg, Command};
-use lazy_static::lazy_static;
 use log::LevelFilter;
 use simplelog::{ColorChoice, CombinedLogger, Config, TermLogger, TerminalMode};
 use std::sync::atomic::AtomicBool;
@@ -12,9 +11,8 @@ mod utils;
 // -------------------------------------------------------------------------------------------------
 // CONSTANTS
 // -------------------------------------------------------------------------------------------------
-lazy_static! {
-    static ref DRY_RUN: AtomicBool = AtomicBool::new(false);
-}
+
+static DRY_RUN: AtomicBool = AtomicBool::new(false);

 // -------------------------------------------------------------------------------------------------
 // MAIN
--- a/tfhe-csprng/Cargo.toml
+++ b/tfhe-csprng/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-csprng"
-version = "0.4.1"
+version = "0.5.0"
 edition = "2021"
 license = "BSD-3-Clause-Clear"
 description = "Cryptographically Secure PRNG used in the TFHE-rs library."
@@ -13,41 +13,25 @@ rust-version = "1.72"

 [dependencies]
 aes = "0.8.2"
-rayon = { version = "1.5.0", optional = true }
+rayon = { workspace = true , optional = true }

 [target.'cfg(target_os = "macos")'.dependencies]
 libc = "0.2.133"

 [dev-dependencies]
-rand = "0.8.3"
+rand = { workspace = true }
 criterion = "0.5.1"
 clap = "=4.4.4"

 [features]
 parallel = ["rayon"]
-seeder_x86_64_rdseed = []
-seeder_unix = []
-generator_x86_64_aesni = []
-generator_fallback = []
-generator_aarch64_aes = []
-
-x86_64 = [
-    "parallel",
-    "seeder_x86_64_rdseed",
-    "generator_x86_64_aesni",
-    "generator_fallback",
-]
-x86_64-unix = ["x86_64", "seeder_unix"]
-aarch64 = ["parallel", "generator_aarch64_aes", "generator_fallback"]
-aarch64-unix = ["aarch64", "seeder_unix"]
+software-prng = []

 [[bench]]
 name = "benchmark"
 path = "benches/benchmark.rs"
 harness = false
-required-features = ["seeder_x86_64_rdseed", "generator_x86_64_aesni"]

 [[example]]
 name = "generate"
 path = "examples/generate.rs"
-required-features = ["seeder_unix", "generator_fallback"]
--- a/tfhe-csprng/README.md
+++ b/tfhe-csprng/README.md
@@ -8,13 +8,13 @@ The implementation is based on the AES blockcipher used in CTR mode, as describe

 Two implementations are available, an accelerated one on x86_64 CPUs with the `aes` feature and the `sse2` feature, and a pure software one that can be used on other platforms.

-The crate also makes two seeders available, one needing the x86_64 feature `rdseed` and another one based on the Unix random device `/dev/random` the latter requires the user to provide a secret.
+The crate also makes two seeders available: one needing the x86_64 instruction `rdseed`, and another based on the Unix random device `/dev/random`; the latter requires the user to provide a secret.

 ## Running the benchmarks

 To execute the benchmarks on an x86_64 platform:
 ```shell
-RUSTFLAGS="-Ctarget-cpu=native" cargo bench --features=seeder_x86_64_rdseed,generator_x86_64_aesni 
+RUSTFLAGS="-Ctarget-cpu=native" cargo bench
 ```

 ## License
--- a/tfhe-csprng/benches/benchmark.rs
+++ b/tfhe-csprng/benches/benchmark.rs
@@ -1,15 +1,53 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use tfhe_csprng::generators::{
-    AesniRandomGenerator, BytesPerChild, ChildrenCount, RandomGenerator,
+    BytesPerChild, ChildrenCount, DefaultRandomGenerator, RandomGenerator,
 };
-use tfhe_csprng::seeders::{RdseedSeeder, Seeder};
+#[cfg(target_os = "macos")]
+use tfhe_csprng::seeders::AppleSecureEnclaveSeeder as ActivatedSeeder;
+#[cfg(all(
+    not(target_os = "macos"),
+    target_arch = "x86_64",
+    target_feature = "rdseed"
+))]
+use tfhe_csprng::seeders::RdseedSeeder as ActivatedSeeder;
+#[cfg(all(
+    not(target_os = "macos"),
+    not(all(target_arch = "x86_64", target_feature = "rdseed")),
+    target_family = "unix"
+))]
+use tfhe_csprng::seeders::UnixSeeder as ActivatedSeeder;
+
+use tfhe_csprng::seeders::Seeder;

 // The number of bytes to generate during one benchmark iteration.
 const N_GEN: usize = 1_000_000;

+fn new_seeder() -> ActivatedSeeder { // builds whichever seeder the cfg-gated imports above aliased as `ActivatedSeeder`
+    #[cfg(target_os = "macos")]
+    {
+        ActivatedSeeder // alias of AppleSecureEnclaveSeeder, written as a bare value
+    }
+    #[cfg(all(
+        not(target_os = "macos"),
+        target_arch = "x86_64",
+        target_feature = "rdseed"
+    ))]
+    {
+        ActivatedSeeder::new() // alias of RdseedSeeder on this configuration
+    }
+    #[cfg(all(
+        not(target_os = "macos"),
+        not(all(target_arch = "x86_64", target_feature = "rdseed")),
+        target_family = "unix"
+    ))]
+    {
+        ActivatedSeeder::new(0) // alias of UnixSeeder; the argument is presumably its secret — TODO confirm
+    }
+}
+
 fn parent_generate(c: &mut Criterion) {
-    let mut seeder = RdseedSeeder;
-    let mut generator = AesniRandomGenerator::new(seeder.seed());
+    let mut seeder = new_seeder();
+    let mut generator = DefaultRandomGenerator::new(seeder.seed());
    c.bench_function("parent_generate", |b| {
        b.iter(|| {
            (0..N_GEN).for_each(|_| {
@@ -20,8 +58,8 @@ fn parent_generate(c: &mut Criterion) {
 }

 fn child_generate(c: &mut Criterion) {
-    let mut seeder = RdseedSeeder;
-    let mut generator = AesniRandomGenerator::new(seeder.seed());
+    let mut seeder = new_seeder();
+    let mut generator = DefaultRandomGenerator::new(seeder.seed());
    let mut generator = generator
        .try_fork(ChildrenCount(1), BytesPerChild(N_GEN * 10_000))
        .unwrap()
@@ -37,8 +75,8 @@ fn child_generate(c: &mut Criterion) {
 }

 fn fork(c: &mut Criterion) {
-    let mut seeder = RdseedSeeder;
-    let mut generator = AesniRandomGenerator::new(seeder.seed());
+    let mut seeder = new_seeder();
+    let mut generator = DefaultRandomGenerator::new(seeder.seed());
    c.bench_function("fork", |b| {
        b.iter(|| {
            black_box(
--- a/tfhe-csprng/build.rs
+++ b/tfhe-csprng/build.rs
@@ -1,115 +0,0 @@
-// To have clear error messages during compilation about why some piece of code may not be available
-// we decided to check the features compatibility with the target configuration in this script.
-
-use std::collections::HashMap;
-use std::env;
-
-// See https://doc.rust-lang.org/reference/conditional-compilation.html#target_arch for various
-// compilation configuration
-
-// Can be easily extended if needed
-pub struct FeatureRequirement {
-    pub feature_name: &'static str,
-    // target_arch requirement
-    pub feature_req_target_arch: Option<&'static str>,
-    // target_family requirement
-    pub feature_req_target_family: Option<&'static str>,
-}
-
-// We implement a version of default that is const which is not possible through the Default trait
-impl FeatureRequirement {
-    // As we cannot use cfg!(feature = "feature_name") with something else than a literal, we need
-    // a reference to the HashMap we populate with the enabled features
-    fn is_activated(&self, build_activated_features: &HashMap<&'static str, bool>) -> bool {
-        *build_activated_features.get(self.feature_name).unwrap()
-    }
-
-    // panics if the requirements are not met
-    fn check_requirements(&self) {
-        let target_arch = get_target_arch_cfg();
-        if let Some(feature_req_target_arch) = self.feature_req_target_arch {
-            if feature_req_target_arch != target_arch {
-                panic!(
-                    "Feature `{}` requires target_arch `{}`, current cfg: `{}`",
-                    self.feature_name, feature_req_target_arch, target_arch
-                )
-            }
-        }
-
-        let target_families = get_target_family_cfgs();
-        if let Some(feature_req_target_family) = self.feature_req_target_family {
-            if target_families
-                .split(',')
-                .all(|family| family != feature_req_target_family)
-            {
-                panic!(
-                    "Feature `{}` requires target_family `{}`, current cfgs: `{}`",
-                    self.feature_name, feature_req_target_family, target_families
-                )
-            }
-        }
-    }
-}
-
-// const vecs are not yet a thing so use a fixed size array (update the array size when adding
-// requirements)
-static FEATURE_REQUIREMENTS: [FeatureRequirement; 4] = [
-    FeatureRequirement {
-        feature_name: "seeder_x86_64_rdseed",
-        feature_req_target_arch: Some("x86_64"),
-        feature_req_target_family: None,
-    },
-    FeatureRequirement {
-        feature_name: "generator_x86_64_aesni",
-        feature_req_target_arch: Some("x86_64"),
-        feature_req_target_family: None,
-    },
-    FeatureRequirement {
-        feature_name: "seeder_unix",
-        feature_req_target_arch: None,
-        feature_req_target_family: Some("unix"),
-    },
-    FeatureRequirement {
-        feature_name: "generator_aarch64_aes",
-        feature_req_target_arch: Some("aarch64"),
-        feature_req_target_family: None,
-    },
-];
-
-// For a "feature_name" feature_cfg!("feature_name") expands to
-// ("feature_name", cfg!(feature = "feature_name"))
-macro_rules! feature_cfg {
-    ($feat_name:literal) => {
-        ($feat_name, cfg!(feature = $feat_name))
-    };
-}
-
-// Static HashMap would require an additional crate (phf or lazy static e.g.), so we just write a
-// function that returns the HashMap we are interested in
-fn get_feature_enabled_status() -> HashMap<&'static str, bool> {
-    HashMap::from([
-        feature_cfg!("seeder_x86_64_rdseed"),
-        feature_cfg!("generator_x86_64_aesni"),
-        feature_cfg!("seeder_unix"),
-        feature_cfg!("generator_aarch64_aes"),
-    ])
-}
-
-// See https://stackoverflow.com/a/43435335/18088947 for the inspiration of this code
-fn get_target_arch_cfg() -> String {
-    env::var("CARGO_CFG_TARGET_ARCH").expect("CARGO_CFG_TARGET_ARCH is not set")
-}
-
-fn get_target_family_cfgs() -> String {
-    env::var("CARGO_CFG_TARGET_FAMILY").expect("CARGO_CFG_TARGET_FAMILY is not set")
-}
-
-fn main() {
-    let feature_enabled_status = get_feature_enabled_status();
-
-    // This will panic if some requirements for a feature are not met
-    FEATURE_REQUIREMENTS
-        .iter()
-        .filter(|&req| FeatureRequirement::is_activated(req, &feature_enabled_status))
-        .for_each(FeatureRequirement::check_requirements);
-}
--- a/tfhe-csprng/examples/generate.rs
+++ b/tfhe-csprng/examples/generate.rs
@@ -2,35 +2,29 @@
 //! the program stdout. It can also generate a fixed number of bytes by passing a value along the
 //! optional argument `--bytes_total`. For testing purpose.
 use clap::{value_parser, Arg, Command};
-#[cfg(feature = "generator_x86_64_aesni")]
-use tfhe_csprng::generators::AesniRandomGenerator as ActivatedRandomGenerator;
-#[cfg(feature = "generator_aarch64_aes")]
-use tfhe_csprng::generators::NeonAesRandomGenerator as ActivatedRandomGenerator;
-use tfhe_csprng::generators::RandomGenerator;
-#[cfg(all(
-    not(feature = "generator_x86_64_aesni"),
-    not(feature = "generator_aarch64_aes"),
-    feature = "generator_fallback"
-))]
-use tfhe_csprng::generators::SoftwareRandomGenerator as ActivatedRandomGenerator;
+use tfhe_csprng::generators::{DefaultRandomGenerator, RandomGenerator};

 use std::io::prelude::*;
 use std::io::{stdout, StdoutLock};
 #[cfg(target_os = "macos")]
 use tfhe_csprng::seeders::AppleSecureEnclaveSeeder as ActivatedSeeder;
-#[cfg(all(not(target_os = "macos"), feature = "seeder_x86_64_rdseed"))]
+#[cfg(all(
+    not(target_os = "macos"),
+    target_arch = "x86_64",
+    target_feature = "rdseed"
+))]
 use tfhe_csprng::seeders::RdseedSeeder as ActivatedSeeder;
 use tfhe_csprng::seeders::Seeder;
 #[cfg(all(
    not(target_os = "macos"),
-    not(feature = "seeder_x86_64_rdseed"),
-    feature = "seeder_unix"
+    not(all(target_arch = "x86_64", target_feature = "rdseed")),
+    target_family = "unix"
 ))]
 use tfhe_csprng::seeders::UnixSeeder as ActivatedSeeder;

 fn write_bytes(
    buffer: &mut [u8],
-    generator: &mut ActivatedRandomGenerator,
+    generator: &mut DefaultRandomGenerator,
    stdout: &mut StdoutLock<'_>,
 ) -> std::io::Result<()> {
    buffer.iter_mut().zip(generator).for_each(|(b, g)| *b = g);
@@ -39,7 +33,7 @@ fn write_bytes(

 fn infinite_bytes_generation(
    buffer: &mut [u8],
-    generator: &mut ActivatedRandomGenerator,
+    generator: &mut DefaultRandomGenerator,
    stdout: &mut StdoutLock<'_>,
 ) {
    while write_bytes(buffer, generator, stdout).is_ok() {}
@@ -48,7 +42,7 @@ fn infinite_bytes_generation(
 fn bytes_generation(
    bytes_total: usize,
    buffer: &mut [u8],
-    generator: &mut ActivatedRandomGenerator,
+    generator: &mut DefaultRandomGenerator,
    stdout: &mut StdoutLock<'_>,
 ) {
    let quotient = bytes_total / buffer.len();
@@ -61,6 +55,29 @@ fn bytes_generation(
    write_bytes(&mut buffer[0..remaining], generator, stdout).unwrap()
 }

+fn new_seeder() -> ActivatedSeeder { // builds whichever seeder the cfg-gated imports above aliased as `ActivatedSeeder`
+    #[cfg(target_os = "macos")]
+    {
+        ActivatedSeeder // alias of AppleSecureEnclaveSeeder, written as a bare value
+    }
+    #[cfg(all(
+        not(target_os = "macos"),
+        target_arch = "x86_64",
+        target_feature = "rdseed"
+    ))]
+    {
+        ActivatedSeeder::new() // alias of RdseedSeeder on this configuration
+    }
+    #[cfg(all(
+        not(target_os = "macos"),
+        not(all(target_arch = "x86_64", target_feature = "rdseed")),
+        target_family = "unix"
+    ))]
+    {
+        ActivatedSeeder::new(0) // alias of UnixSeeder; the argument is presumably its secret — TODO confirm
+    }
+}
+
 pub fn main() {
    let matches = Command::new(
        "Generate a stream of random numbers, specify no flags for infinite generation",
@@ -74,25 +91,11 @@ pub fn main() {
    )
    .get_matches();

-    // Ugly hack to be able to use UnixSeeder
-    #[cfg(all(
-        not(target_os = "macos"),
-        not(feature = "seeder_x86_64_rdseed"),
-        feature = "seeder_unix"
-    ))]
-    let new_seeder = || ActivatedSeeder::new(0);
-    #[cfg(not(all(
-        not(target_os = "macos"),
-        not(feature = "seeder_x86_64_rdseed"),
-        feature = "seeder_unix"
-    )))]
-    let new_seeder = || ActivatedSeeder;
-
    let mut seeder = new_seeder();
    let seed = seeder.seed();
    // Don't print on std out
    eprintln!("seed={seed:?}");
-    let mut generator = ActivatedRandomGenerator::new(seed);
+    let mut generator = DefaultRandomGenerator::new(seed);
    let stdout = stdout();
    let mut buffer = [0u8; 16];

--- a/tfhe-csprng/src/generators/aes_ctr/mod.rs
+++ b/tfhe-csprng/src/generators/aes_ctr/mod.rs
@@ -206,7 +206,6 @@ pub use index::*;

 /// A module containing structures to manage table indices and buffer pointers together properly.
 mod states;
-pub use states::*;

 /// A module containing an abstraction for aes block ciphers.
 mod block_cipher;
--- a/tfhe-csprng/src/generators/aes_ctr/parallel.rs
+++ b/tfhe-csprng/src/generators/aes_ctr/parallel.rs
@@ -1,6 +1,5 @@
-use crate::generators::aes_ctr::{
-    AesBlockCipher, AesCtrGenerator, ChildrenClosure, State, TableIndex,
-};
+use crate::generators::aes_ctr::states::State;
+use crate::generators::aes_ctr::{AesBlockCipher, AesCtrGenerator, ChildrenClosure, TableIndex};
 use crate::generators::{BytesPerChild, ChildrenCount, ForkError};

 /// A type alias for the parallel children iterator type.
--- a/tfhe-csprng/src/generators/default.rs
+++ b/tfhe-csprng/src/generators/default.rs
@@ -0,0 +1,9 @@
+#[cfg(all(target_arch = "x86_64", not(feature = "software-prng")))]
+pub type DefaultRandomGenerator = super::AesniRandomGenerator; // x86_64, unless `software-prng` is enabled
+#[cfg(all(target_arch = "aarch64", not(feature = "software-prng")))]
+pub type DefaultRandomGenerator = super::NeonAesRandomGenerator; // aarch64, unless `software-prng` is enabled
+#[cfg(any(
+    feature = "software-prng",
+    not(any(target_arch = "x86_64", target_arch = "aarch64"))
+))]
+pub type DefaultRandomGenerator = super::SoftwareRandomGenerator; // `software-prng` enabled, or any other architecture
--- a/tfhe-csprng/src/generators/implem/aarch64/block_cipher.rs
+++ b/tfhe-csprng/src/generators/implem/aarch64/block_cipher.rs
@@ -25,7 +25,8 @@ impl AesBlockCipher for ArmAesBlockCipher {
        if !(aes_detected && neon_detected) {
            panic!(
                "The ArmAesBlockCipher requires both aes and neon aarch64 CPU features.\n\
-                aes feature available: {}\nneon feature available: {}\n.",
+                aes feature available: {}\nneon feature available: {}\n\
+                Please consider enabling the SoftwareRandomGenerator with the `software-prng` feature",
                aes_detected, neon_detected
            )
        }
--- a/tfhe-csprng/src/generators/implem/aesni/block_cipher.rs
+++ b/tfhe-csprng/src/generators/implem/aesni/block_cipher.rs
@@ -20,7 +20,8 @@ impl AesBlockCipher for AesniBlockCipher {
        if !(aes_detected && sse2_detected) {
            panic!(
                "The AesniBlockCipher requires both aes and sse2 x86 CPU features.\n\
-                aes feature available: {}\nsse2 feature available: {}\n.",
+                aes feature available: {}\nsse2 feature available: {}\n\
+                Please consider enabling the SoftwareRandomGenerator with the `software-prng` feature",
                aes_detected, sse2_detected
            )
        }
--- a/tfhe-csprng/src/generators/implem/mod.rs
+++ b/tfhe-csprng/src/generators/implem/mod.rs
@@ -1,14 +1,12 @@
-#[cfg(feature = "generator_x86_64_aesni")]
+#[cfg(target_arch = "x86_64")]
 mod aesni;
-#[cfg(feature = "generator_x86_64_aesni")]
+#[cfg(target_arch = "x86_64")]
 pub use aesni::*;

-#[cfg(feature = "generator_aarch64_aes")]
+#[cfg(target_arch = "aarch64")]
 mod aarch64;
-#[cfg(feature = "generator_aarch64_aes")]
+#[cfg(target_arch = "aarch64")]
 pub use aarch64::*;

-#[cfg(feature = "generator_fallback")]
 mod soft;
-#[cfg(feature = "generator_fallback")]
 pub use soft::*;
--- a/tfhe-csprng/src/generators/mod.rs
+++ b/tfhe-csprng/src/generators/mod.rs
@@ -123,6 +123,10 @@ mod aes_ctr;
 mod implem;
 pub use implem::*;

+pub mod default;
+/// Convenience alias for the most efficient CSPRNG implementation available.
+pub use default::DefaultRandomGenerator;
+
 #[cfg(test)]
 #[allow(unused)] // to please clippy when tests are not activated
 pub mod generator_generic_test {
--- a/tfhe-csprng/src/seeders/implem/mod.rs
+++ b/tfhe-csprng/src/seeders/implem/mod.rs
@@ -3,12 +3,12 @@ mod apple_secure_enclave_seeder;
 #[cfg(target_os = "macos")]
 pub use apple_secure_enclave_seeder::AppleSecureEnclaveSeeder;

-#[cfg(feature = "seeder_x86_64_rdseed")]
+#[cfg(target_arch = "x86_64")]
 mod rdseed;
-#[cfg(feature = "seeder_x86_64_rdseed")]
+#[cfg(target_arch = "x86_64")]
 pub use rdseed::RdseedSeeder;

-#[cfg(feature = "seeder_unix")]
+#[cfg(target_family = "unix")]
 mod unix;
-#[cfg(feature = "seeder_unix")]
+#[cfg(target_family = "unix")]
 pub use unix::UnixSeeder;
--- a/tfhe-csprng/src/seeders/implem/rdseed.rs
+++ b/tfhe-csprng/src/seeders/implem/rdseed.rs
@@ -4,7 +4,23 @@ use crate::seeders::{Seed, Seeder};
 ///
 /// The `rdseed` instruction allows to deliver seeds from a hardware source of entropy see
 /// <https://www.felixcloutier.com/x86/rdseed> .
-pub struct RdseedSeeder;
+pub struct RdseedSeeder(());
+
+impl RdseedSeeder {
+    /// Builds the seeder; panics when `Self::is_available()` reports that `rdseed` cannot be used.
+    pub fn new() -> Self {
+        if !Self::is_available() {
+            panic!("Tried to use RdseedSeeder but rdseed instruction is not enabled on the current machine");
+        }
+        Self(())
+    }
+}
+
+impl Default for RdseedSeeder {
+    fn default() -> Self {
+        Self::new() // same availability check (and panic) as `new`
+    }
+}

 impl Seeder for RdseedSeeder {
    fn seed(&mut self) -> Seed {
@@ -46,6 +62,6 @@ mod test {

    #[test]
    fn check_bounded_sequence_difference() {
-        check_seeder_fixed_sequences_different(|_| RdseedSeeder);
+        check_seeder_fixed_sequences_different(|_| RdseedSeeder::new());
    }
 }
--- a/tfhe-fft/Cargo.toml
+++ b/tfhe-fft/Cargo.toml
@@ -29,7 +29,7 @@ serde = ["dep:serde", "num-complex/serde"]

 [dev-dependencies]
 rustfft = "6.0"
-rand = "0.8"
+rand = { workspace = true }
 bincode = "1.3"
 more-asserts = "0.3.1"
 serde_json = "1.0.96"
--- a/tfhe-fft/README.md
+++ b/tfhe-fft/README.md
@@ -40,7 +40,7 @@ Additionally, an optional 128-bit negacyclic FFT module is provided.
 ```rust
 use tfhe_fft::c64;
 use tfhe_fft::ordered::{Method, Plan};
-use dyn_stack::{GlobalPodBuffer, PodStack, ReborrowMut};
+use dyn_stack::{GlobalPodBuffer, PodStack};
 use num_complex::ComplexFloat;
 use std::time::Duration;

@@ -48,7 +48,7 @@ fn main() {
    const N: usize = 4;
    let plan = Plan::new(4, Method::Measure(Duration::from_millis(10)));
    let mut scratch_memory = GlobalPodBuffer::new(plan.fft_scratch().unwrap());
-    let mut stack = PodStack::new(&mut scratch_memory);
+    let stack = PodStack::new(&mut scratch_memory);

    let data = [
        c64::new(1.0, 0.0),
@@ -58,10 +58,10 @@ fn main() {
    ];

    let mut transformed_fwd = data;
-    plan.fwd(&mut transformed_fwd, stack.rb_mut());
+    plan.fwd(&mut transformed_fwd, stack);

    let mut transformed_inv = transformed_fwd;
-    plan.inv(&mut transformed_inv, stack.rb_mut());
+    plan.inv(&mut transformed_inv, stack);

    for (actual, expected) in transformed_inv.iter().map(|z| z / N as f64).zip(data) {
        assert!((expected - actual).abs() < 1e-9);
--- a/tfhe-fft/benches/fft.rs
+++ b/tfhe-fft/benches/fft.rs
@@ -1,6 +1,6 @@
 use core::ptr::NonNull;
 use criterion::{criterion_group, criterion_main, Criterion};
-use dyn_stack::{PodStack, ReborrowMut, StackReq};
+use dyn_stack::{PodStack, StackReq};
 use serde::Serialize;
 use std::{fs, path::PathBuf};
 use tfhe_fft::c64;
@@ -129,7 +129,7 @@ pub fn bench_ffts(c: &mut Criterion) {
            StackReq::new_aligned::<c64>(n, 256),     // src
            StackReq::new_aligned::<c64>(n, 256),     // dst
        ]));
-        let mut stack = PodStack::new(&mut mem);
+        let stack = PodStack::new(&mut mem);
        let z = c64::new(0.0, 0.0);

        use rustfft::FftPlannerAvx;
@@ -139,8 +139,8 @@ pub fn bench_ffts(c: &mut Criterion) {
        let unordered =
            tfhe_fft::unordered::Plan::new(n, tfhe_fft::unordered::Method::Measure(bench_duration));

-        let (dst, stack) = stack.rb_mut().make_aligned_with::<c64, _>(n, 64, |_| z);
-        let (src, mut stack) = stack.make_aligned_with::<c64, _>(n, 64, |_| z);
+        let (dst, stack) = stack.make_aligned_with::<c64>(n, 64, |_| z);
+        let (src, stack) = stack.make_aligned_with::<c64>(n, 64, |_| z);

        let bench_id = format!("rustfft-fwd-{n}");
        c.bench_function(&bench_id, |b| {
@@ -164,19 +164,19 @@ pub fn bench_ffts(c: &mut Criterion) {
                tfhe_fft::ordered::Plan::new(n, tfhe_fft::ordered::Method::Measure(bench_duration));

            let bench_id = format!("tfhe-ordered-fwd-{n}");
-            c.bench_function(&bench_id, |b| b.iter(|| ordered.fwd(dst, stack.rb_mut())));
+            c.bench_function(&bench_id, |b| b.iter(|| ordered.fwd(dst, stack)));
            write_to_json(&bench_id, "tfhe-ordered-fwd", n);
        }

        let bench_id = format!("tfhe-unordered-fwd-{n}");
        c.bench_function(&bench_id, |b| {
-            b.iter(|| unordered.fwd(dst, stack.rb_mut()));
+            b.iter(|| unordered.fwd(dst, stack));
        });
        write_to_json(&bench_id, "tfhe-unordered-fwd", n);

        let bench_id = format!("tfhe-unordered-inv-{n}");
        c.bench_function(&bench_id, |b| {
-            b.iter(|| unordered.inv(dst, stack.rb_mut()));
+            b.iter(|| unordered.inv(dst, stack));
        });
        write_to_json(&bench_id, "tfhe-unordered-inv", n);

--- a/tfhe-fft/src/fft128/f128_ops.rs
+++ b/tfhe-fft/src/fft128/f128_ops.rs
@@ -645,7 +645,7 @@ pub mod x86 {

    #[inline(always)]
    pub(crate) fn two_diff_f64x4(simd: V3, a: f64x4, b: f64x4) -> (f64x4, f64x4) {
-        two_sum_f64x4(simd, a, simd.f64s_neg(b))
+        two_sum_f64x4(simd, a, simd.neg_f64s(b))
    }

    #[inline(always)]
@@ -677,7 +677,7 @@ pub mod x86 {
    #[inline(always)]
    #[cfg(feature = "nightly")]
    pub(crate) fn two_diff_f64x8(simd: V4, a: f64x8, b: f64x8) -> (f64x8, f64x8) {
-        two_sum_f64x8(simd, a, simd.f64s_neg(b))
+        two_sum_f64x8(simd, a, simd.neg_f64s(b))
    }

    #[cfg(feature = "nightly")]
@@ -714,8 +714,8 @@ pub mod x86 {
            simd,
            a,
            f64x16 {
-                lo: simd.f64s_neg(b.lo),
-                hi: simd.f64s_neg(b.hi),
+                lo: simd.neg_f64s(b.lo),
+                hi: simd.neg_f64s(b.hi),
            },
        )
    }
--- a/tfhe-fft/src/lib.rs
+++ b/tfhe-fft/src/lib.rs
@@ -36,14 +36,14 @@
 #![cfg_attr(not(feature = "std"), doc = "```ignore")]
 //! use tfhe_fft::c64;
 //! use tfhe_fft::ordered::{Plan, Method};
-//! use dyn_stack::{PodStack, GlobalPodBuffer, ReborrowMut};
+//! use dyn_stack::{PodStack, GlobalPodBuffer};
 //! use num_complex::ComplexFloat;
 //! use std::time::Duration;
 //!
 //! const N: usize = 4;
 //! let plan = Plan::new(4, Method::Measure(Duration::from_millis(10)));
 //! let mut scratch_memory = GlobalPodBuffer::new(plan.fft_scratch().unwrap());
-//! let mut stack = PodStack::new(&mut scratch_memory);
+//! let stack = PodStack::new(&mut scratch_memory);
 //!
 //! let data = [
 //!     c64::new(1.0, 0.0),
@@ -53,10 +53,10 @@
 //! ];
 //!
 //! let mut transformed_fwd = data;
-//! plan.fwd(&mut transformed_fwd, stack.rb_mut());
+//! plan.fwd(&mut transformed_fwd, stack);
 //!
 //! let mut transformed_inv = transformed_fwd;
-//! plan.inv(&mut transformed_inv, stack.rb_mut());
+//! plan.inv(&mut transformed_inv, stack);
 //!
 //! for (actual, expected) in transformed_inv.iter().map(|z| z / N as f64).zip(data) {
 //!     assert!((expected - actual).abs() < 1e-9);
--- a/tfhe-fft/src/ordered.rs
+++ b/tfhe-fft/src/ordered.rs
@@ -16,7 +16,7 @@ use aligned_vec::{avec, ABox, CACHELINE_ALIGN};
 #[cfg(feature = "std")]
 use core::time::Duration;
 #[cfg(feature = "std")]
-use dyn_stack::{GlobalPodBuffer, ReborrowMut};
+use dyn_stack::GlobalPodBuffer;
 use dyn_stack::{PodStack, SizeOverflow, StackReq};

 /// Internal FFT algorithm.
@@ -65,7 +65,7 @@ fn measure_n_runs(
    buf: &mut [c64],
    twiddles_init: &[c64],
    twiddles: &[c64],
-    stack: PodStack,
+    stack: &mut PodStack,
 ) -> Duration {
    let n = buf.len();
    let (scratch, _) = stack.make_aligned_raw::<c64>(n, CACHELINE_ALIGN);
@@ -99,7 +99,7 @@ pub(crate) fn measure_fastest_scratch(n: usize) -> StackReq {
 pub(crate) fn measure_fastest(
    min_bench_duration_per_algo: Duration,
    n: usize,
-    stack: PodStack,
+    stack: &mut PodStack,
 ) -> (FftAlgo, Duration) {
    const N_ALGOS: usize = 8;
    const MIN_DURATION: Duration = if cfg!(target_arch = "wasm32") {
@@ -116,14 +116,14 @@ pub(crate) fn measure_fastest(

    let f = |_| c64 { re: 0.0, im: 0.0 };

-    let (twiddles, stack) = stack.make_aligned_with::<c64, _>(2 * n, align, f);
+    let (twiddles, stack) = stack.make_aligned_with::<c64>(2 * n, align, f);
    let twiddles_init = &twiddles[..n];
    let twiddles = &twiddles[n..];
-    let (buf, mut stack) = stack.make_aligned_with::<c64, _>(n, align, f);
+    let (buf, stack) = stack.make_aligned_with::<c64>(n, align, f);

    {
        // initialize scratch to load it in the cpu cache
-        drop(stack.rb_mut().make_aligned_with::<c64, _>(n, align, f));
+        drop(stack.make_aligned_with::<c64>(n, align, f));
    }

    let mut avg_durations = [Duration::ZERO; N_ALGOS];
@@ -149,8 +149,7 @@ pub(crate) fn measure_fastest(
            let mut n_runs: u128 = 1;

            loop {
-                let duration =
-                    measure_n_runs(n_runs, algo, buf, twiddles_init, twiddles, stack.rb_mut());
+                let duration = measure_n_runs(n_runs, algo, buf, twiddles_init, twiddles, stack);

                if duration < MIN_DURATION {
                    n_runs *= 2;
@@ -165,8 +164,7 @@ pub(crate) fn measure_fastest(
        *avg = if n_runs <= init_n_runs {
            approx_duration
        } else {
-            let duration =
-                measure_n_runs(n_runs, algo, buf, twiddles_init, twiddles, stack.rb_mut());
+            let duration = measure_n_runs(n_runs, algo, buf, twiddles_init, twiddles, stack);
            duration_div_f64(duration, n_runs as f64)
        };
    }
@@ -339,7 +337,7 @@ impl Plan {
    /// let mut buf = [c64::default(); 4];
    /// plan.fwd(&mut buf, stack);
    /// ```
-    pub fn fwd(&self, buf: &mut [c64], stack: PodStack) {
+    pub fn fwd(&self, buf: &mut [c64], stack: &mut PodStack) {
        let n = self.fft_size();
        let (scratch, _) = stack.make_aligned_raw::<c64>(n, CACHELINE_ALIGN);
        let (w_init, w) = split_2(&self.twiddles);
@@ -353,19 +351,19 @@ impl Plan {
    #[cfg_attr(not(feature = "std"), doc = " ```ignore")]
    /// use tfhe_fft::c64;
    /// use tfhe_fft::ordered::{Method, Plan};
-    /// use dyn_stack::{PodStack, GlobalPodBuffer, ReborrowMut};
+    /// use dyn_stack::{PodStack, GlobalPodBuffer};
    /// use core::time::Duration;
    ///
    /// let plan = Plan::new(4, Method::Measure(Duration::from_millis(10)));
    ///
    /// let mut memory = GlobalPodBuffer::new(plan.fft_scratch().unwrap());
-    /// let mut stack = PodStack::new(&mut memory);
+    /// let stack = PodStack::new(&mut memory);
    ///
    /// let mut buf = [c64::default(); 4];
-    /// plan.fwd(&mut buf, stack.rb_mut());
+    /// plan.fwd(&mut buf, stack);
    /// plan.inv(&mut buf, stack);
    /// ```
-    pub fn inv(&self, buf: &mut [c64], stack: PodStack) {
+    pub fn inv(&self, buf: &mut [c64], stack: &mut PodStack) {
        let n = self.fft_size();
        let (scratch, _) = stack.make_aligned_raw::<c64>(n, CACHELINE_ALIGN);
        let (w_init, w) = split_2(&self.twiddles_inv);
--- a/tfhe-fft/src/unordered.rs
+++ b/tfhe-fft/src/unordered.rs
@@ -18,7 +18,7 @@ use aligned_vec::{avec, ABox, CACHELINE_ALIGN};
 #[cfg(feature = "std")]
 use core::time::Duration;
 #[cfg(feature = "std")]
-use dyn_stack::{GlobalPodBuffer, ReborrowMut};
+use dyn_stack::GlobalPodBuffer;
 use dyn_stack::{PodStack, SizeOverflow, StackReq};

 #[inline(always)]
@@ -553,7 +553,7 @@ fn measure_fastest_scratch(n: usize) -> StackReq {
 fn measure_fastest(
    mut min_bench_duration_per_algo: Duration,
    n: usize,
-    mut stack: PodStack,
+    stack: &mut PodStack,
 ) -> (FftAlgo, usize, Duration) {
    const MIN_DURATION: Duration = Duration::from_millis(1);
    min_bench_duration_per_algo = min_bench_duration_per_algo.max(MIN_DURATION);
@@ -581,11 +581,8 @@ fn measure_fastest(
            n_algos += 1;

            // we'll measure the corresponding plan
-            let (base_algo, duration) = crate::ordered::measure_fastest(
-                min_bench_duration_per_algo,
-                base_n,
-                stack.rb_mut(),
-            );
+            let (base_algo, duration) =
+                crate::ordered::measure_fastest(min_bench_duration_per_algo, base_n, stack);

            algos[i] = Some(base_algo);

@@ -599,11 +596,9 @@ fn measure_fastest(

            let f = |_| c64 { re: 0.0, im: 0.0 };
            let align = CACHELINE_ALIGN;
-            let (w, stack) = stack
-                .rb_mut()
-                .make_aligned_with::<c64, _>(n + base_n, align, f);
-            let (scratch, stack) = stack.make_aligned_with::<c64, _>(base_n, align, f);
-            let (z, _) = stack.make_aligned_with::<c64, _>(n, align, f);
+            let (w, stack) = stack.make_aligned_with::<c64>(n + base_n, align, f);
+            let (scratch, stack) = stack.make_aligned_with::<c64>(base_n, align, f);
+            let (z, _) = stack.make_aligned_with::<c64>(n, align, f);

            let n_runs = min_bench_duration_per_algo.as_secs_f64()
                / (duration.as_secs_f64() * (n / base_n) as f64);
@@ -823,7 +818,7 @@ impl Plan {
    /// let mut buf = [c64::default(); 4];
    /// plan.fwd(&mut buf, stack);
    /// ```
-    pub fn fwd(&self, buf: &mut [c64], stack: PodStack) {
+    pub fn fwd(&self, buf: &mut [c64], stack: &mut PodStack) {
        assert_eq!(self.fft_size(), buf.len());
        let (scratch, _) = stack.make_aligned_raw::<c64>(self.algo().1, CACHELINE_ALIGN);
        fwd_depth(
@@ -912,19 +907,19 @@ impl Plan {
    #[cfg_attr(not(feature = "std"), doc = " ```ignore")]
    /// use tfhe_fft::c64;
    /// use tfhe_fft::unordered::{Method, Plan};
-    /// use dyn_stack::{PodStack, GlobalPodBuffer, ReborrowMut};
+    /// use dyn_stack::{PodStack, GlobalPodBuffer};
    /// use core::time::Duration;
    ///
    /// let plan = Plan::new(4, Method::Measure(Duration::from_millis(10)));
    ///
    /// let mut memory = GlobalPodBuffer::new(plan.fft_scratch().unwrap());
-    /// let mut stack = PodStack::new(&mut memory);
+    /// let stack = PodStack::new(&mut memory);
    ///
    /// let mut buf = [c64::default(); 4];
-    /// plan.fwd(&mut buf, stack.rb_mut());
+    /// plan.fwd(&mut buf, stack);
    /// plan.inv(&mut buf, stack);
    /// ```
-    pub fn inv(&self, buf: &mut [c64], stack: PodStack) {
+    pub fn inv(&self, buf: &mut [c64], stack: &mut PodStack) {
        assert_eq!(self.fft_size(), buf.len());
        let (scratch, _) = stack.make_aligned_raw::<c64>(self.algo().1, CACHELINE_ALIGN);
        inv_depth(
@@ -995,7 +990,7 @@ impl Plan {
            base_n: usize,
        }

-        impl<'de, 'a> Visitor<'de> for SeqVisitor<'a> {
+        impl<'de> Visitor<'de> for SeqVisitor<'_> {
            type Value = ();

            fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
@@ -1062,7 +1057,7 @@ fn bit_rev_twice_inv(nbits: u32, base_nbits: u32, i: usize) -> usize {
 mod tests {
    use super::*;
    use alloc::vec;
-    use dyn_stack::{GlobalPodBuffer, ReborrowMut};
+    use dyn_stack::GlobalPodBuffer;
    use num_complex::ComplexFloat;
    use rand::random;

@@ -1157,8 +1152,8 @@ mod tests {
                },
            );
            let mut mem = GlobalPodBuffer::new(plan.fft_scratch().unwrap());
-            let mut stack = PodStack::new(&mut mem);
-            plan.fwd(&mut z, stack.rb_mut());
+            let stack = PodStack::new(&mut mem);
+            plan.fwd(&mut z, stack);
            plan.inv(&mut z, stack);

            for z in &mut z {
@@ -9400,7 +9395,7 @@ mod tests {
 mod tests_serde {
    use super::*;
    use alloc::{vec, vec::Vec};
-    use dyn_stack::{GlobalPodBuffer, ReborrowMut};
+    use dyn_stack::GlobalPodBuffer;
    use num_complex::ComplexFloat;
    use rand::random;

@@ -9440,9 +9435,9 @@ mod tests_serde {
                    .unwrap()
                    .or(plan2.fft_scratch().unwrap()),
            );
-            let mut stack = PodStack::new(&mut mem);
+            let stack = PodStack::new(&mut mem);

-            plan1.fwd(&mut z, stack.rb_mut());
+            plan1.fwd(&mut z, stack);

            let mut buf = Vec::<u8>::new();
            let mut serializer = bincode::Serializer::new(&mut buf, bincode::options());
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Pedro Alves	f9e94d9b6b	chore(gpu): replace cudaStreamCaptureModeGlobal by cudaStreamCaptureModeThreadLocal to avoid CPU multi-thread issues	2025-01-06 18:01:21 -03:00
Pedro Alves	d66e36b529	feat(gpu): implement CUDA Graph to accelerate default classical and multibit PBS	2025-01-06 17:29:48 -03:00
Agnes Leroy	33ca2c2fab	chore(gpu): update multi-bit params, add noise test for the classical & multi-bit PBS	2025-01-06 18:17:27 +01:00
Agnes Leroy	b22e369166	chore(ci): switch gpu tests on push to l40 and deactivate fast pks	2025-01-06 16:54:40 +01:00
dependabot[bot]	90edfdbbe7	chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 3.0.18 to 3.0.19. - [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases) - [Commits](`6441882669...6ae615f647`) --- updated-dependencies: - dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2025-01-06 13:09:43 +01:00
dependabot[bot]	f998f00580	chore(deps): bump tj-actions/changed-files from 45.0.5 to 45.0.6 Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 45.0.5 to 45.0.6. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](`bab30c2299...d6e91a2266`) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2025-01-06 13:09:30 +01:00
Mayeul@Zama	57a31d19a7	test(shortint): remove oprf test flakiness	2025-01-03 18:34:02 +01:00
David Testé	9c43c30e66	chore(ci): fix concurrency group format on pull request event Since the addition of the pull_request_target event, the github.ref context object returns the name of the base branch. So when a workflow was triggered on the base branch during an execution in a pull request, the latter would be cancelled. Using github.head_ref, when available, fixes this behavior. On any event other than pull_request or pull_request_target, github.ref will still be used and work as before.	2025-01-03 17:18:41 +01:00
Arthur Meyre	a9d48c7e35	chore: force wasm-bindgen-rayon to 1.2.2 the new 1.3.0 version changes the way some files are bundled, I don't want to discover during the release that nothing works properly anymore.	2025-01-02 14:50:58 +01:00
Nicolas Sarlin	863d51feaf	chore(hl): remove unused traits	2025-01-02 13:52:37 +01:00
Nicolas Sarlin	ae2aeb3b6b	chore(core_crypto): remove unused cfg(bench)	2025-01-02 13:52:37 +01:00
dependabot[bot]	5c44ffad27	chore(deps): bump codecov/codecov-action from 5.1.1 to 5.1.2 Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 5.1.1 to 5.1.2. - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](`7f8b4b4bde...1e68e06f1d`) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2025-01-02 13:12:35 +01:00
dependabot[bot]	e42d203fc5	chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 3.0.17 to 3.0.18. - [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases) - [Commits](`5d6ac37a4c...6441882669`) --- updated-dependencies: - dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] <support@github.com>	2025-01-02 13:12:27 +01:00
dependabot[bot]	37f25c0ce5	chore(deps): bump actions/upload-artifact from 4.4.3 to 4.5.0 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.4.3 to 4.5.0. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](`b4b15b8c7c...6f51ac03b9`) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com>	2025-01-02 13:12:20 +01:00
Guillermo Oyarzun	cd03b7eef7	feat(gpu): implement vector comparisons gpu	2024-12-23 17:06:55 +01:00
Andrei Stoian	2c8f0ce7de	feat(gpu): optimize packing keyswitch in ML special case	2024-12-23 10:32:23 -03:00
Agnes Leroy	e3a93c7d87	chore(ci): add randomized long run tests on CPU and GPU	2024-12-20 17:13:58 +01:00
Agnes Leroy	9b43a9459a	chore(gpu): add option to pre-release some buffers in scalar mul	2024-12-20 11:53:31 +01:00
David Testé	33d5091025	chore(deps): bump zama-ai/slab-github-runner	2024-12-20 10:56:22 +01:00
Guillermo Oyarzun	70ff0f726c	feat(gpu): implement subarray search	2024-12-20 08:54:35 +01:00
Agnes Leroy	13d55f31ac	chore(gpu): minor fix in core crypto comments	2024-12-19 15:54:40 +01:00
Agnes Leroy	7e871e54e1	chore(gpu): fix inconsistency in the use of AmortizedDegree	2024-12-19 15:54:40 +01:00
Agnes Leroy	012585204a	chore(gpu): add inputs to erc20 throughput bench with multiple GPUs	2024-12-19 15:03:11 +01:00
Agnes Leroy	d6e45858c1	chore(gpu): rework single carry proip to avoid using local streams	2024-12-19 10:02:14 +01:00
Mayeul@Zama	ae832c158f	chore(csprng): cleanup conditional imports	2024-12-19 09:59:04 +01:00
Mayeul@Zama	8504d79180	chore(core): remove unused file	2024-12-19 09:59:04 +01:00
David Testé	c306e63430	chore(ci): fix secret token naming to avoid collision	2024-12-18 19:56:36 +01:00
David Testé	9195753273	chore(ci): verify triggering actor on pull request from fork If a contributor who opens a Pull Request from a fork is not one of the repository collaborators, then the workflow using the check_triggering_actor subworkflow will exit with a failure. It can be re-run later by a collaborator who has write access. This allows reviewers to read the proposed code before running the CI, ensuring no secrets are leaked outside the repository.	2024-12-18 18:44:52 +01:00
David Testé	bda8ab028e	chore(ci): allow external contribution in fast aws tests workflow	2024-12-18 17:17:58 +01:00
Beka Barbakadze	9e8db2179e	fix(gpu): fix noise level calculation in full propagation	2024-12-18 14:26:56 +01:00
dependabot[bot]	950cece2a9	chore(deps): bump dtolnay/rust-toolchain Bumps [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) from 315e265cd78dad1e1dcf3a5074f6d6c47029d5aa to a54c7afa936fefeb4456b2dd8068152669aa8203. - [Release notes](https://github.com/dtolnay/rust-toolchain/releases) - [Commits](`315e265cd7...a54c7afa93`) --- updated-dependencies: - dependency-name: dtolnay/rust-toolchain dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com>	2024-12-18 11:39:47 +01:00
Agnes Leroy	aee53d3fae	fix(gpu): fix lut size in are_all_comparison_blocks_true	2024-12-18 09:11:07 +01:00
Nicolas Sarlin	4e2db929da	chore(csprng): prepare release 0.5.0	2024-12-17 09:22:08 +01:00
Nicolas Sarlin	d2c13e4593	chore(doc): fix c_api setup command using removed make target	2024-12-17 09:22:08 +01:00
Nicolas Sarlin	c41b76f892	chore(csprng)!: moved the RandomGenerator detector into tfhe-csprng	2024-12-17 09:22:08 +01:00
Nicolas Sarlin	1ede004e9a	chore(tfhe)!: remove arch specific features BREAKING_CHANGE: - The x86_64, x86_64-unix, aarch64, aarch64-unix have been removed, the target architecture and os family are now automatically detected. A `software_prng` feature has been added to force the use of a software implementation on older CPUs	2024-12-17 09:22:08 +01:00
Nicolas Sarlin	1df331d246	chore(csprng)!: remove generator_aarch64_aes feature BREAKING_CHANGE: - The `generator_aarch64_aes` feature is no longer supported for tfhe-csprng	2024-12-17 09:22:08 +01:00
Nicolas Sarlin	0f2451e3b7	chore(csprng)!: remove generator_x86_64_aesni feature BREAKING_CHANGE: - The `generator_x86_64_aesni` feature is no longer supported for tfhe-csprng	2024-12-17 09:22:08 +01:00
Nicolas Sarlin	3de23d14a2	chore(csprng)!: remove seeder_x86_64_rdseed feature BREAKING_CHANGE: - The `seeder_x86_64_rdseed` feature is no longer supported for tfhe-csprng	2024-12-17 09:22:08 +01:00
Nicolas Sarlin	e0ee8af1ac	chore(csprng)!: remove seeder_unix feature BREAKING_CHANGE: - The `seeder_unix` feature is no longer supported for tfhe-csprng	2024-12-17 09:22:08 +01:00
Agnes Leroy	072005d521	fix(gpu): fix memory leak	2024-12-17 08:58:16 +01:00
Agnes Leroy	241b73704c	fix(gpu): fix ct degree and noise level after some ops	2024-12-17 08:58:03 +01:00
Agnes Leroy	8687b69769	fix(gpu): fix single gpu on device other than 0	2024-12-17 08:57:40 +01:00
Nicolas Sarlin	cdb81dd262	doc(shortint): add some clarification about shortint size	2024-12-16 16:28:54 +01:00
Nicolas Sarlin	03956a9a24	chore(zk): check that k <= d for zk crs	2024-12-16 16:00:15 +01:00
Nicolas Sarlin	ef684649f9	chore(backward): move allow(dead_code) to dispatch variants This makes it possible to detect unused dispatch enums	2024-12-16 16:00:15 +01:00
Nicolas Sarlin	fc642c6f26	chore(zk)!: update parameters for zk v2	2024-12-16 16:00:15 +01:00
Nicolas Sarlin	c2a999d300	feat(zk)!: plug zk v2 BREAKING CHANGE: - The object ZkVerificationOutCome has been renamed ZkVerificationOutcome. - Conformance of proofs now checks the scheme version of the CRS. This is breaking at the shortint and core_crypto levels, and for manually built integer conformance params. New CRS will be generated with the V2 Scheme by default, but V1 CRS and proofs are still accepted, so this is not breaking. New methods have been added to generate a V1 CRS.	2024-12-16 16:00:15 +01:00
Nicolas Sarlin	ae3e5f1a32	fix(zk-pok): missing Versionize for ComputeLoadProofFields	2024-12-16 16:00:15 +01:00
Nicolas Sarlin	3dcb982a0b	feat(versionable): "Version" macro now handles transparent attribute	2024-12-16 16:00:15 +01:00
Agnes Leroy	e9c901b3a9	chore(gpu): rework select to avoid using local streams	2024-12-16 15:26:14 +01:00
Mayeul@Zama	2d8907dfed	chore: fix clippy lints after toolchain update	2024-12-16 14:17:20 +01:00
Agnes Leroy	06f8fc8962	chore(gpu): make artifact name unique across different machines	2024-12-16 14:09:30 +01:00
Nicolas Sarlin	381aeb572f	chore(all): remove the dependency to lazy_static	2024-12-16 11:24:20 +01:00
Arthur Meyre	3a99ee9718	chore: remove aliases for gaussian parameters for compact PK - we are TUniform by default so no more aliases for gaussian parameters	2024-12-16 09:50:12 +01:00
Agnes Leroy	86f07045fe	chore(gpu): run pbs in parallel in difference_check	2024-12-16 09:23:41 +01:00
Mayeul@Zama	b1ce34f8a7	chore(hlapi): stabilize FheTypes	2024-12-13 18:31:30 +01:00
Agnes Leroy	4388a3dc99	chore(gpu): add sxm5 vm target	2024-12-13 17:25:55 +01:00
Arthur Meyre	805436839d	fix(shortint): fix compression encoding change not being taken into account - this maps better to what was optimized and will dramatically diminish the pfail as we now have 2 more bits for the LUT redundancy	2024-12-13 16:41:13 +01:00
Arthur Meyre	bdbec55e84	chore: do not crash when ark-ff or wasm_bindgen macros have cfg issues	2024-12-13 16:31:25 +01:00
Arthur Meyre	33131c664a	chore(ci): toolchain update	2024-12-13 16:31:25 +01:00
Arthur Meyre	1151bb267e	chore: update dependencies	2024-12-13 16:31:25 +01:00
Agnes Leroy	ce9679f1ee	doc(gpu): add an example to use arrays on GPU	2024-12-13 10:46:28 +01:00
Agnes Leroy	23b43c33c7	fix(gpu): fix scalar ne	2024-12-12 11:26:51 +01:00
Agnes Leroy	6feaf49906	chore(gpu): remove stream sync in broadcast lut	2024-12-12 10:19:02 +01:00
Agnes Leroy	25f4e5f279	fix(gpu): fix equal	2024-12-12 09:21:44 +01:00
David Testé	c1f05cbf85	chore(ci): use composite action to setup hyperstack instance	2024-12-12 09:18:33 +01:00