clippy bug no span for large array on stack

chore(ci): toolchain update
chore: update dependencies
2026-04-28 03:01:21 -04:00 · 2024-12-03 11:00:31 +01:00 · 2024-12-02 16:34:19 +01:00 · 2024-12-02 11:12:46 +01:00
540 changed files with 12643 additions and 32174 deletions
--- a/.github/actions/hyperstack_setup/action.yml
+++ b/.github/actions/hyperstack_setup/action.yml
@@ -1,53 +0,0 @@
-name: Setup Cuda
-description: Setup Cuda on Hyperstack instance
-
-inputs:
-  cuda-version:
-    description: Version of Cuda to use
-    required: true
-  gcc-version:
-    description: Version of GCC to use
-    required: true
-  cmake-version:
-    description: Version of cmake to use
-    default: 3.29.6
-
-runs:
-  using: "composite"
-  steps:
-    # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-    - name: Install dependencies
-      shell: bash
-      run: |
-        sudo apt update
-        sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
-        wget https://github.com/Kitware/CMake/releases/download/v${{ inputs.cmake-version }}/cmake-${{ inputs.cmake-version }}.tar.gz
-        tar -zxvf cmake-${{ inputs.cmake-version }}.tar.gz
-        cd cmake-${{ inputs.cmake-version }}
-        ./bootstrap
-        make -j"$(nproc)"
-        sudo make install
-
-    - name: Export CUDA variables
-      shell: bash
-      run: |
-        CUDA_PATH=/usr/local/cuda-${{ inputs.cuda-version }}
-        echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-        echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-        echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-        echo "CUDACXX=/usr/local/cuda-${{ inputs.cuda-version }}/bin/nvcc" >> "${GITHUB_ENV}"
-
-    # Specify the correct host compilers
-    - name: Export gcc and g++ variables
-      shell: bash
-      run: |
-        {
-          echo "CC=/usr/bin/gcc-${{ inputs.gcc-version }}";
-          echo "CXX=/usr/bin/g++-${{ inputs.gcc-version }}";
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ inputs.gcc-version }}";
-          echo "HOME=/home/ubuntu";
-        } >> "${GITHUB_ENV}"
-
-    - name: Check device is detected
-      shell: bash
-      run: nvidia-smi
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -100,7 +100,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -11,26 +11,16 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-  pull_request_target:

 jobs:
-  check-user-permission:
-    if: github.event_name == 'pull_request' || github.event_name == 'pull_request_target'
-    uses: ./.github/workflows/check_triggering_actor.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  should-run:
    runs-on: ubuntu-latest
-    needs: check-user-permission
-    if: github.event_name != 'pull_request_target' ||
-      needs.check-user-permission.result == 'success'
    permissions:
      pull-requests: write
    outputs:
@@ -65,11 +55,10 @@ jobs:
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-          ref: ${{ github.event.pull_request.head.sha }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -144,7 +133,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -159,7 +148,7 @@ jobs:
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    needs: [ should-run, setup-instance ]
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -168,10 +157,9 @@ jobs:
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-          ref: ${{ github.event.pull_request.head.sha }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -211,7 +199,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
+        uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
        with:
          path: |
            ~/.nvm
@@ -224,7 +212,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
+        uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -282,7 +270,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -59,7 +59,6 @@ jobs:
              - tfhe/src/core_crypto/**
              - tfhe/src/shortint/**
              - tfhe/src/integer/**
-              - .github/workflows/aws_tfhe_integer_tests.yml

  setup-instance:
    name: Setup instance (unsigned-integer-tests)
@@ -75,7 +74,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +98,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -140,7 +139,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -59,7 +59,6 @@ jobs:
              - tfhe/src/core_crypto/**
              - tfhe/src/shortint/**
              - tfhe/src/integer/**
-              - .github/workflows/aws_tfhe_signed_integer_tests.yml

  setup-instance:
    name: Setup instance (unsigned-integer-tests)
@@ -75,7 +74,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +98,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -144,7 +143,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -67,7 +67,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -142,7 +142,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -250,7 +250,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -61,7 +61,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
+        uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
        with:
          path: |
            ~/.nvm
@@ -74,7 +74,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
+        uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -119,7 +119,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -93,7 +93,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_boolean
          path: ${{ env.RESULTS_FILENAME }}
@@ -127,7 +127,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -26,7 +26,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -58,7 +58,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -81,7 +81,7 @@ jobs:
          --walk-subdirs

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -115,7 +115,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -97,7 +97,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_erc20
          path: ${{ env.RESULTS_FILENAME }}
@@ -124,7 +124,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -54,7 +54,7 @@ jobs:
          echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -82,7 +82,7 @@ jobs:
          --walk-subdirs

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_integer_multi_bit_gpu_default
          path: ${{ env.RESULTS_FILENAME }}
@@ -127,7 +127,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -157,7 +157,7 @@ jobs:
      

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -48,19 +48,28 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Get benchmark details
        run: |
          {
@@ -75,10 +84,31 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
      - name: Run benchmarks with AVX512
        run: |
          make bench_pbs_gpu
@@ -98,7 +128,7 @@ jobs:
          --walk-subdirs

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_core_crypto
          path: ${{ env.RESULTS_FILENAME }}
@@ -137,7 +167,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_erc20.yml
+++ b/.github/workflows/benchmark_gpu_erc20.yml
@@ -12,10 +12,7 @@ on:
          - "l40 (n3-L40x1)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
-          - "4-h100 (n3-H100x4)"
          - "multi-h100 (n3-H100x8)"
-          - "multi-h100-nvlink (n3-H100x8-NVLink)"
-          - "multi-h100-sxm5 (n3-H100x8-SXM5)"

 jobs:
  parse-inputs:
--- a/.github/workflows/benchmark_gpu_erc20_common.yml
+++ b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -54,7 +54,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -75,19 +75,28 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Get benchmark details
        run: |
          {
@@ -102,10 +111,34 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run benchmarks
        run: |
          make bench_hlapi_erc20_gpu
@@ -124,9 +157,9 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
-          name: ${{ github.sha }}_erc20_${{ inputs.profile }}
+          name: ${{ github.sha }}_erc20
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
@@ -163,7 +196,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_gpu_integer.yml
+++ b/.github/workflows/benchmark_gpu_integer.yml
@@ -15,7 +15,6 @@ on:
          - "4-h100 (n3-H100x4)"
          - "multi-h100 (n3-H100x8)"
          - "multi-h100-nvlink (n3-H100x8-NVLink)"
-          - "multi-h100-sxm5 (n3-H100x8-SXM5)"
          - "multi-a100-nvlink (n3-A100x8-NVLink)"
      command:
        description: "Benchmark command to run"
--- a/.github/workflows/benchmark_gpu_integer_common.yml
+++ b/.github/workflows/benchmark_gpu_integer_common.yml
@@ -118,7 +118,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -145,19 +145,28 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Get benchmark details
        run: |
          {
@@ -172,10 +181,41 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CUDA_PATH=$CUDA_PATH";
+            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
+            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
+          } >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+          } >> "${GITHUB_ENV}"
+
+      - name: Checkout Slab repo
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        with:
+          repository: zama-ai/slab
+          path: slab
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Should run benchmarks with all precisions
        if: inputs.all_precisions
        run: |
@@ -196,22 +236,14 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --name-suffix avx512 \
-          --bench-type ${{ matrix.bench_type }}
+          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
-          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}
+          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}

-      - name: Checkout Slab repo
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          repository: zama-ai/slab
-          path: slab
-          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
-
      - name: Send data to Slab
        shell: bash
        run: |
@@ -239,7 +271,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_integer.yml
+++ b/.github/workflows/benchmark_integer.yml
@@ -90,7 +90,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -170,7 +170,7 @@ jobs:
          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -197,7 +197,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -56,7 +56,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -136,7 +136,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -163,7 +163,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_signed_integer.yml
+++ b/.github/workflows/benchmark_signed_integer.yml
@@ -90,7 +90,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -164,7 +164,7 @@ jobs:
          --bench-type ${{ matrix.bench_type }}

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -191,7 +191,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -32,7 +32,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,7 +84,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_fft
          path: ${{ env.RESULTS_FILENAME }}
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -32,7 +32,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,7 +84,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_ntt
          path: ${{ env.RESULTS_FILENAME }}
@@ -126,7 +126,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_tfhe_zk_pok.yml
+++ b/.github/workflows/benchmark_tfhe_zk_pok.yml
@@ -36,7 +36,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -58,7 +58,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,7 +91,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -121,7 +121,7 @@ jobs:
          --name-suffix avx512

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_tfhe_zk_pok
          path: ${{ env.RESULTS_FILENAME }}
@@ -155,7 +155,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -40,7 +40,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -65,7 +65,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -99,7 +99,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -109,7 +109,7 @@ jobs:

      - name: Node cache restoration
        id: node-cache
-        uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
+        uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
        with:
          path: |
            ~/.nvm
@@ -122,7 +122,7 @@ jobs:
          make install_node

      - name: Node cache save
-        uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 #v4.2.0
+        uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a #v4.1.2
        if: steps.node-cache.outputs.cache-hit != 'true'
        with:
          path: |
@@ -166,7 +166,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_wasm_${{ matrix.browser }}
          path: ${{ env.RESULTS_FILENAME }}
@@ -200,7 +200,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -47,7 +47,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -104,7 +104,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -141,7 +141,7 @@ jobs:
          } >> "${GITHUB_ENV}"

      - name: Install rust
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: nightly

@@ -177,7 +177,7 @@ jobs:
          --append-results

      - name: Upload parsed results artifact
-        uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882
        with:
          name: ${{ github.sha }}_integer_zk
          path: ${{ env.RESULTS_FILENAME }}
@@ -211,7 +211,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/cargo_build.yml
+++ b/.github/workflows/cargo_build.yml
@@ -28,7 +28,7 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

--- a/.github/workflows/check_triggering_actor.yml
+++ b/.github/workflows/check_triggering_actor.yml
@@ -1,29 +0,0 @@
-# Check if triggering actor is a collaborator and has write access
-name: Check Triggering Actor
-
-on:
-  workflow_call:
-    secrets:
-      TOKEN:
-        required: true
-
-jobs:
-  check-actor-permission:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Get User Permission
-        id: check-access
-        uses: actions-cool/check-user-permission@956b2e73cdfe3bcb819bb7225e490cb3b18fd76e # v2.2.1
-        with:
-          require: write
-          username: ${{ github.triggering_actor }}
-        env:
-          GITHUB_TOKEN: ${{ secrets.TOKEN }}
-
-      - name: Check User Permission
-        if: steps.check-access.outputs.require-result == 'false'
-        run: |
-          echo "${{ github.triggering_actor }} does not have permissions on this repo."
-          echo "Current permission level is ${{ steps.check-access.outputs.user-permission }}"
-          echo "Job originally triggered by ${{ github.actor }}"
-          exit 1
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -27,7 +27,7 @@ jobs:
          make lint_workflow

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@6ae615f6475d2ede5ad88bea6baa7a1d5e93ffaa # v3.0.19
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@5d6ac37a4cef8b8df67f482a8e384987766f0213 # v3.0.17
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -47,13 +47,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          files_yaml: |
            tfhe:
@@ -83,7 +83,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@1e68e06f1dbfde0e4cefc87efeba9e4643565303
+        uses: codecov/codecov-action@015f24e6818733317a2da2edd6290ab26238649a
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@1e68e06f1dbfde0e4cefc87efeba9e4643565303
+        uses: codecov/codecov-action@015f24e6818733317a2da2edd6290ab26238649a
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -121,7 +121,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -27,7 +27,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -51,7 +51,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -75,7 +75,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -40,7 +40,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,28 +94,60 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run core crypto and internal CUDA backend tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
@@ -155,7 +187,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -34,7 +34,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -66,7 +66,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -92,28 +92,60 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run core crypto and internal CUDA backend tests
        run: |
          make test_core_crypto_gpu
@@ -153,7 +185,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -25,7 +25,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,6 +49,9 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
      - name: Install dependencies
@@ -68,21 +71,38 @@ jobs:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run core crypto, integer and internal CUDA backend tests
        run: |
          make test_gpu
@@ -119,7 +139,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,28 +94,60 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run multi-bit CUDA integer compression tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
@@ -158,7 +190,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -1,4 +1,4 @@
-name: Long Run Tests on GPU
+name: AWS Long Run Tests on GPU

 env:
  CARGO_TERM_COLOR: always
@@ -15,8 +15,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Weekly tests will be triggered each Friday at 9p.m.
-    - cron: "0 21 * * 5"
+    # Weekly tests will be triggered each Friday at 1 a.m. (UTC).
+    - cron: '0 1 * * FRI'

 jobs:
  setup-instance:
@@ -29,17 +29,17 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: hyperstack
-          profile: multi-gpu-test
+          profile: single-h100

  cuda-tests:
-    name: Long run GPU tests
+    name: Long run GPU H100 tests
    needs: [ setup-instance ]
    concurrency:
      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
@@ -53,26 +53,57 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
-    timeout-minutes: 4320 # 72 hours
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run tests
        run: |
          make test_integer_long_run_gpu
@@ -88,7 +119,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests.result }}
-          SLACK_MESSAGE: "Integer GPU long run tests finished with status: ${{ needs.cuda-tests.result }}. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer GPU H100 long run tests finished with status: ${{ needs.cuda-tests.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (gpu-tests)
@@ -98,7 +129,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -24,7 +24,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -63,7 +63,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -110,7 +110,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,25 +94,58 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run signed integer tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_gpu_ci
@@ -138,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,25 +94,58 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run signed integer multi-bit tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_signed_integer_multi_bit_gpu_ci
@@ -138,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -42,7 +42,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -75,7 +75,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -101,28 +101,57 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
      - name: Should run nightly tests
        if: github.event_name == 'schedule'
        run: |
@@ -131,6 +160,10 @@ jobs:
            echo "NIGHTLY_TESTS=TRUE";
          } >> "${GITHUB_ENV}"

+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run signed integer multi-bit tests
        run: |
          make test_signed_integer_multi_bit_gpu_ci
@@ -156,7 +189,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,25 +94,58 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run unsigned integer tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_gpu_ci
@@ -138,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -35,7 +35,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -68,7 +68,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -94,25 +94,58 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11 
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run unsigned integer multi-bit tests
        run: |
          BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_multi_bit_gpu_ci
@@ -138,7 +171,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -41,7 +41,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
+        uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf
        with:
          since_last_remote_commit: true
          files_yaml: |
@@ -74,7 +74,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -100,25 +100,54 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11
+    env:
+      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
+      CMAKE_VERSION: 3.29.6
    steps:
+      # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+          wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
+          cd cmake-${{ env.CMAKE_VERSION }}
+          ./bootstrap
+          make -j"$(nproc)"
+          sudo make install
+
+
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

-      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/hyperstack_setup
-        with:
-          cuda-version: ${{ matrix.cuda }}
-          gcc-version: ${{ matrix.gcc }}
-
      - name: Set up home
        run: |
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

+      - name: Export CUDA variables
+        if: ${{ !cancelled() }}
+        run: |
+          echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+          echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
+          echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+      # Specify the correct host compilers
+      - name: Export gcc and g++ variables
+        if: ${{ !cancelled() }}
+        run: |
+          {
+            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
+            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
+          } >> "${GITHUB_ENV}"
+
      - name: Should run nightly tests
        if: github.event_name == 'schedule'
        run: |
@@ -127,6 +156,10 @@ jobs:
            echo "NIGHTLY_TESTS=TRUE";
          } >> "${GITHUB_ENV}"

+      - name: Check device is detected
+        if: ${{ !cancelled() }}
+        run: nvidia-smi
+
      - name: Run unsigned integer multi-bit tests
        run: |
          make test_unsigned_integer_multi_bit_gpu_ci
@@ -152,7 +185,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -15,8 +15,8 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  schedule:
-    # Weekly tests will be triggered each Friday at 9p.m.
-    - cron: "0 21 * * 5"
+    # Weekly tests will be triggered each Friday at 1 a.m. (UTC).
+    - cron: '0 1 * * FRI'

 jobs:
  setup-instance:
@@ -29,7 +29,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -45,7 +45,6 @@ jobs:
      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
-    timeout-minutes: 4320 # 72 hours
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -54,7 +53,7 @@ jobs:
          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -78,7 +77,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -39,7 +39,7 @@ jobs:
          persist-credentials: "false"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -50,7 +50,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe
-      - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
+      - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
        with:
          name: crate
          path: target/package/*.crate
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -36,7 +36,7 @@ jobs:
    steps:
      - name: Start instance
        id: start-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
+        uses: dtolnay/rust-toolchain@315e265cd78dad1e1dcf3a5074f6d6c47029d5aa
        with:
          toolchain: stable

@@ -120,7 +120,7 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
+        uses: zama-ai/slab-github-runner@98f0788261a7323d5d695a883e20df36591a92b7
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
--- a/233
+++ b/233
@@ -2,6 +2,7 @@ SHELL:=$(shell /usr/bin/env which bash)
 OS:=$(shell uname)
 RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
 CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
+TARGET_ARCH_FEATURE:=$(shell ./scripts/get_arch_feature.sh)
 CPU_COUNT=$(shell ./scripts/cpu_count.sh)
 RS_BUILD_TOOLCHAIN:=stable
 CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
@@ -281,14 +282,14 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
 		--all-targets \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
 		--all-targets \
 		-p $(TFHE_SPEC)

@@ -307,51 +308,52 @@ lint_workflow: check_actionlint_installed
 .PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
 clippy_core: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
+		--features=$(TARGET_ARCH_FEATURE) \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=experimental \
+		--features=$(TARGET_ARCH_FEATURE),experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=nightly-avx512 \
+		--features=$(TARGET_ARCH_FEATURE),nightly-avx512 \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=experimental,nightly-avx512 \
+		--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=zk-pok \
+		--features=$(TARGET_ARCH_FEATURE),zk-pok \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_boolean # Run clippy lints enabling the boolean features
 clippy_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean \
+		--features=$(TARGET_ARCH_FEATURE),boolean \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_shortint # Run clippy lints enabling the shortint features
 clippy_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=shortint \
+		--features=$(TARGET_ARCH_FEATURE),shortint \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=shortint,experimental \
+		--features=$(TARGET_ARCH_FEATURE),shortint,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=zk-pok,shortint \
+		--features=$(TARGET_ARCH_FEATURE),zk-pok,shortint \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_integer # Run clippy lints enabling the integer features
 clippy_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=integer \
+		--features=$(TARGET_ARCH_FEATURE),integer \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=integer,experimental \
+		--features=$(TARGET_ARCH_FEATURE),integer,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy # Run clippy lints enabling the boolean, shortint, integer
 clippy: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=boolean,shortint,integer \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_rustdoc # Run clippy lints on doctests enabling the boolean, shortint, integer and zk-pok
@@ -362,13 +364,13 @@ clippy_rustdoc: install_rs_check_toolchain
 	fi && \
 	CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok,pbs-stats,strings \
 		-p $(TFHE_SPEC)

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
 clippy_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
@@ -393,16 +395,17 @@ clippy_trivium: install_rs_check_toolchain
 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
 clippy_tfhe_csprng: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=parallel,software-prng -p tfhe-csprng -- --no-deps -D warnings
+		--features=$(TARGET_ARCH_FEATURE) \
+		-p tfhe-csprng -- --no-deps -D warnings

 .PHONY: clippy_zk_pok # Run clippy lints on tfhe-zk-pok
 clippy_zk_pok: install_rs_check_toolchain
@@ -441,67 +444,67 @@ check_rust_bindings_did_not_change:
 .PHONY: tfhe_lints # Run custom tfhe-rs lints
 tfhe_lints: install_tfhe_lints
 	cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
-		--features=boolean,shortint,integer,zk-pok -- -D warnings
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok -- -D warnings

 .PHONY: build_core # Build core_crypto without experimental features
 build_core: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		-p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE) -p $(TFHE_SPEC)
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-			--features=nightly-avx512 -p $(TFHE_SPEC); \
+			--features=$(TARGET_ARCH_FEATURE),nightly-avx512 -p $(TFHE_SPEC); \
 	fi

 .PHONY: build_core_experimental # Build core_crypto with experimental features
 build_core_experimental: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=experimental -p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE),experimental -p $(TFHE_SPEC)
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-			--features=experimental,nightly-avx512 -p $(TFHE_SPEC); \
+			--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 -p $(TFHE_SPEC); \
 	fi

 .PHONY: build_boolean # Build with boolean enabled
 build_boolean: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean -p $(TFHE_SPEC) --all-targets
+		--features=$(TARGET_ARCH_FEATURE),boolean -p $(TFHE_SPEC) --all-targets

 .PHONY: build_shortint # Build with shortint enabled
 build_shortint: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=shortint -p $(TFHE_SPEC) --all-targets
+		--features=$(TARGET_ARCH_FEATURE),shortint -p $(TFHE_SPEC) --all-targets

 .PHONY: build_integer # Build with integer enabled
 build_integer: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=integer -p $(TFHE_SPEC) --all-targets
+		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) --all-targets

 .PHONY: build_tfhe_full # Build with boolean, shortint and integer enabled
 build_tfhe_full: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets

 .PHONY: build_tfhe_coverage # Build with test coverage enabled
 build_tfhe_coverage: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests

 .PHONY: build_c_api # Build the C API for boolean, shortint and integer
 build_c_api: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok \
 		-p $(TFHE_SPEC)

 .PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
 build_c_api_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,gpu \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,gpu \
 		-p $(TFHE_SPEC)

 .PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
 build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,zk-pok,experimental-force_fft_algo_dif4 \
 		-p $(TFHE_SPEC)

 .PHONY: build_web_js_api # Build the js API targeting the web browser
@@ -532,15 +535,15 @@ build_node_js_api: install_rs_build_toolchain install_wasm_pack
 .PHONY: build_tfhe_csprng # Build tfhe_csprng
 build_tfhe_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
-		-p tfhe-csprng --all-targets
+		--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng --all-targets

 .PHONY: test_core_crypto # Run the tests of the core_crypto module including experimental ones
 test_core_crypto: install_rs_build_toolchain install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=experimental,zk-pok -p $(TFHE_SPEC) -- core_crypto::
+		--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok -p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-			--features=experimental,zk-pok,nightly-avx512 -p $(TFHE_SPEC) -- core_crypto::; \
+			--features=$(TARGET_ARCH_FEATURE),experimental,zk-pok,nightly-avx512 -p $(TFHE_SPEC) -- core_crypto::; \
 	fi

 .PHONY: test_core_crypto_cov # Run the tests of the core_crypto module with code coverage
@@ -548,13 +551,13 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
 		--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-		--features=experimental,internal-keycache \
+		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
 		-p $(TFHE_SPEC) -- core_crypto::
 	@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
 		RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 			--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
 			--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
-			--features=experimental,internal-keycache,nightly-avx512 \
+			--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,nightly-avx512 \
 			-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
 	fi

@@ -572,38 +575,35 @@ test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
 .PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
 test_core_crypto_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
+		--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::

 .PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
 test_integer_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key:: --test-threads=6
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

-.PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
-test_integer_long_run_gpu: install_rs_check_toolchain install_cargo_nextest
-	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
-	LONG_TESTS=TRUE \
-		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
-		--tfhe-package "$(TFHE_SPEC)" --backend "gpu"
+.PHONY: test_integer_long_run_gpu # Run the long run integer tests on the gpu backend
+test_integer_long_run_gpu: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu,__long_run_tests -p $(TFHE_SPEC) -- integer::gpu::server_key::radix::tests_long_run --test-threads=6

 .PHONY: test_integer_compression
 test_integer_compression: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=integer -p $(TFHE_SPEC) -- integer::ciphertext::compressed_ciphertext_list::tests::
+		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) -- integer::ciphertext::compressed_ciphertext_list::tests::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=integer -p $(TFHE_SPEC) -- integer::ciphertext::compress
+		--features=$(TARGET_ARCH_FEATURE),integer -p $(TFHE_SPEC) -- integer::ciphertext::compress

 .PHONY: test_integer_compression_gpu
 test_integer_compression_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
-		--features=integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
+		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress

 .PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
 test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
@@ -662,20 +662,20 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
 .PHONY: test_boolean # Run the tests of the boolean module
 test_boolean: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=boolean -p $(TFHE_SPEC) -- boolean::
+		--features=$(TARGET_ARCH_FEATURE),boolean -p $(TFHE_SPEC) -- boolean::

 .PHONY: test_boolean_cov # Run the tests of the boolean module with code coverage
 test_boolean_cov: install_rs_check_toolchain install_tarpaulin
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
-		--features=boolean,internal-keycache \
+		--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache \
 		-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::

 .PHONY: test_c_api_rs # Run the rust tests for the C API
 test_c_api_rs: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=boolean-c-api,shortint-c-api,high-level-c-api \
+		--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
 		-p $(TFHE_SPEC) \
 		c_api

@@ -707,14 +707,14 @@ test_shortint_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
 .PHONY: test_shortint # Run all the tests for shortint
 test_shortint: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=shortint,internal-keycache -p $(TFHE_SPEC) -- shortint::
+		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p $(TFHE_SPEC) -- shortint::

 .PHONY: test_shortint_cov # Run the tests of the shortint module with code coverage
 test_shortint_cov: install_rs_check_toolchain install_tarpaulin
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
 		--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
 		$(COVERAGE_EXCLUDED_FILES) \
-		--features=shortint,internal-keycache \
+		--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
 		-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::

 .PHONY: test_integer_ci # Run the tests for integer ci
@@ -771,28 +771,26 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
 		--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
 		--signed-only --tfhe-package "$(TFHE_SPEC)"

-.PHONY: test_integer_long_run # Run the long run integer tests
-test_integer_long_run: install_rs_check_toolchain install_cargo_nextest
-	BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
-	LONG_TESTS=TRUE \
-		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
-		--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
-		--tfhe-package "$(TFHE_SPEC)"
+.PHONY: test_integer_long_run # Run the long run tests for integer
+test_integer_long_run: install_rs_build_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
+						--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,__long_run_tests -p $(TFHE_SPEC) -- integer::server_key::radix_parallel::tests_long_run
+

 .PHONY: test_safe_serialization # Run the tests for safe serialization
 test_safe_serialization: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_serialization::
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) -- safe_serialization::

 .PHONY: test_zk # Run the tests for the zk module of the TFHE-rs crate
 test_zk: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=shortint,zk-pok -p $(TFHE_SPEC) -- zk::
+		--features=$(TARGET_ARCH_FEATURE),shortint,zk-pok -p $(TFHE_SPEC) -- zk::

 .PHONY: test_integer # Run all the tests for integer
 test_integer: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=integer,internal-keycache -p $(TFHE_SPEC) -- integer::
+		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::

 .PHONY: test_integer_cov # Run the tests of the integer module with code coverage
 test_integer_cov: install_rs_check_toolchain install_tarpaulin
@@ -800,38 +798,38 @@ test_integer_cov: install_rs_check_toolchain install_tarpaulin
 		--out xml --output-dir coverage/integer --line --engine llvm --timeout 500 \
 		--implicit-test-threads \
 		--exclude-files $(COVERAGE_EXCLUDED_FILES) \
-		--features=integer,internal-keycache \
+		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
 		-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::

 .PHONY: test_high_level_api # Run all the tests for high_level_api
 test_high_level_api: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok -p $(TFHE_SPEC) \
 		-- high_level_api::

 test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
-		--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
 		-E "test(/high_level_api::.*gpu.*/)"

 .PHONY: test_strings # Run the tests for strings ci
 test_strings: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--features=shortint,integer,strings -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),shortint,integer,strings -p $(TFHE_SPEC) \
 		-- strings::


 .PHONY: test_user_doc # Run tests from the .md documentation
 test_user_doc: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
 		-p $(TFHE_SPEC) \
 		-- test_user_docs::

 .PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
 test_user_doc_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
-		--features=boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
 		-- test_user_docs::


@@ -839,12 +837,14 @@ test_user_doc_gpu: install_rs_build_toolchain
 .PHONY: test_regex_engine # Run tests for regex_engine example
 test_regex_engine: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--example regex_engine --features=integer
+		--example regex_engine \
+		--features=$(TARGET_ARCH_FEATURE),integer

 .PHONY: test_sha256_bool # Run tests for sha256_bool example
 test_sha256_bool: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		--example sha256_bool --features=boolean
+		--example sha256_bool \
+		--features=$(TARGET_ARCH_FEATURE),boolean

 .PHONY: test_examples # Run tests for examples
 test_examples: test_sha256_bool test_regex_engine
@@ -862,7 +862,7 @@ test_kreyvium: install_rs_build_toolchain
 .PHONY: test_tfhe_csprng # Run tfhe-csprng tests
 test_tfhe_csprng: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		-p tfhe-csprng
+		--features=$(TARGET_ARCH_FEATURE) -p tfhe-csprng

 .PHONY: test_zk_pok # Run tfhe-zk-pok tests
 test_zk_pok: install_rs_build_toolchain
@@ -880,7 +880,7 @@ test_zk_wasm_x86_compat_ci: check_nvm_installed
 test_zk_wasm_x86_compat: install_rs_build_toolchain build_node_js_api
 	cd tfhe/tests/zk_wasm_x86_test && npm install
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-		-p tfhe --test zk_wasm_x86_test --features=integer,zk-pok
+		-p tfhe --test zk_wasm_x86_test --features=$(TARGET_ARCH_FEATURE),integer,zk-pok

 .PHONY: test_versionable # Run tests for tfhe-versionable subcrate
 test_versionable: install_rs_build_toolchain
@@ -893,7 +893,7 @@ test_versionable: install_rs_build_toolchain
 test_backward_compatibility_ci: install_rs_build_toolchain
 	TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
 		--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
-		--features=shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
+		--features=$(TARGET_ARCH_FEATURE),shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture

 .PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
 test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
@@ -908,7 +908,7 @@ doc: install_rs_check_toolchain
 	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok --no-deps -p $(TFHE_SPEC)
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental,zk-pok --no-deps -p $(TFHE_SPEC)

 .PHONY: docs # Build rust doc alias for doc
 docs: doc
@@ -919,7 +919,7 @@ lint_doc: install_rs_check_toolchain
 	DOCS_RS=1 \
 	RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
 	cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
-		--features=boolean,shortint,integer,strings,gpu,internal-keycache,experimental,zk-pok -p $(TFHE_SPEC) --no-deps
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,gpu,internal-keycache,experimental,zk-pok -p $(TFHE_SPEC) --no-deps

 .PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
 lint_docs: lint_doc
@@ -948,7 +948,7 @@ check_md_links: install_mlc
 .PHONY: check_compile_tests # Build tests in debug without running them
 check_compile_tests: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
-		--features=experimental,boolean,shortint,integer,internal-keycache \
+		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache \
 		-p $(TFHE_SPEC)

 	@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
@@ -959,7 +959,7 @@ check_compile_tests: install_rs_build_toolchain
 .PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
 check_compile_tests_benches_gpu: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
-		--features=experimental,boolean,shortint,integer,internal-keycache,gpu \
+		--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,gpu \
 		-p $(TFHE_SPEC)
 	mkdir -p "$(TFHECUDA_BUILD)" && \
 		cd "$(TFHECUDA_BUILD)" && \
@@ -1038,42 +1038,42 @@ dieharder_csprng: install_dieharder build_tfhe_csprng
 .PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
 print_doc_bench_parameters:
 	RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
-	--features=shortint,internal-keycache -p tfhe
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p tfhe

 .PHONY: bench_integer # Run benchmarks for unsigned integer
 bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
@@ -1081,7 +1081,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -1089,7 +1089,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1097,7 +1097,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
 bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1105,14 +1105,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
+	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench zk-pke-bench \
-	--features=integer,internal-keycache,zk-pok,nightly-avx512 \
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,zk-pok,nightly-avx512 \
 	-p $(TFHE_SPEC) --

 .PHONY: bench_shortint # Run benchmarks for shortint
@@ -1120,14 +1120,14 @@ bench_shortint: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_shortint_oprf # Run benchmarks for shortint OPRF
 bench_shortint_oprf: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench oprf-shortint-bench \
-	--features=shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_shortint_multi_bit # Run benchmarks for shortint using multi-bit parameters
 bench_shortint_multi_bit: install_rs_check_toolchain
@@ -1135,43 +1135,43 @@ bench_shortint_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench shortint-bench \
-	--features=shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_boolean # Run benchmarks for boolean
 bench_boolean: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench boolean-bench \
-	--features=boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_pbs # Run benchmarks for PBS
 bench_pbs: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs-bench \
-	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_pbs128 # Run benchmarks for PBS using FFT 128 bits
 bench_pbs128: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs128-bench \
-	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
 bench_pbs_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_FAST_BENCH=$(FAST_BENCH) cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench pbs-bench \
-	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_ks # Run benchmarks for keyswitch
 bench_ks: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench ks-bench \
-	--features=boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 .PHONY: bench_ks_gpu # Run benchmarks for keyswitch on GPU backend
 bench_ks_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench ks-bench \
-	--features=boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)

 bench_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
 bench_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
@@ -1207,13 +1207,13 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
 bench_hlapi_erc20: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
-	--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_hlapi_erc20_gpu # Run benchmarks for ERC20 operations on GPU
 bench_hlapi_erc20_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench hlapi-erc20 \
-	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --

 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
 bench_tfhe_zk_pok: install_rs_check_toolchain
@@ -1228,32 +1228,32 @@ bench_tfhe_zk_pok: install_rs_check_toolchain
 gen_key_cache: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 		--example generates_test_keys \
-		--features=boolean,shortint,experimental,internal-keycache -p $(TFHE_SPEC) \
+		--features=$(TARGET_ARCH_FEATURE),boolean,shortint,experimental,internal-keycache -p $(TFHE_SPEC) \
 		-- $(MULTI_BIT_ONLY) $(COVERAGE_ONLY)

 .PHONY: gen_key_cache_core_crypto # Run function to generate keys and cache them for core_crypto tests
 gen_key_cache_core_crypto: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --tests --profile $(CARGO_PROFILE) \
-		--features=experimental,internal-keycache -p $(TFHE_SPEC) -- --nocapture \
+		--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache -p $(TFHE_SPEC) -- --nocapture \
 		core_crypto::keycache::generate_keys

 .PHONY: measure_hlapi_compact_pk_ct_sizes # Measure sizes of public keys and ciphertext for high-level API
 measure_hlapi_compact_pk_ct_sizes: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example hlapi_compact_pk_ct_sizes \
-	--features=integer,internal-keycache
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache

 .PHONY: measure_shortint_key_sizes # Measure sizes of bootstrapping and key switching keys for shortint
 measure_shortint_key_sizes: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example shortint_key_sizes \
-	--features=shortint,internal-keycache
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache

 .PHONY: measure_boolean_key_sizes # Measure sizes of bootstrapping and key switching keys for boolean
 measure_boolean_key_sizes: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example boolean_key_sizes \
-	--features=boolean,internal-keycache
+	--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache

 .PHONY: parse_integer_benches # Run python parser to output a csv containing integer benches data
 parse_integer_benches:
@@ -1265,13 +1265,14 @@ parse_integer_benches:
 parse_wasm_benchmarks: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example wasm_benchmarks_parser \
-	--features=shortint,internal-keycache \
+	--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
 	-- wasm_benchmark_results.json

 .PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
 write_params_to_file: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example write_params_to_file --features=boolean,shortint,internal-keycache
+	--example write_params_to_file \
+	--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache

 .PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
 clone_backward_compat_data:
@@ -1286,26 +1287,26 @@ tfhe/$(BACKWARD_COMPAT_DATA_DIR): clone_backward_compat_data
 .PHONY: regex_engine # Run regex_engine example
 regex_engine: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example regex_engine --features=integer \
+	--example regex_engine \
+	--features=$(TARGET_ARCH_FEATURE),integer \
 	-- $(REGEX_STRING) $(REGEX_PATTERN)

 .PHONY: dark_market # Run dark market example
 dark_market: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example dark_market \
-	--features=integer,internal-keycache \
+	--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
 	-- fhe-modified fhe-parallel plain fhe

 .PHONY: sha256_bool # Run sha256_bool example
 sha256_bool: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
-	--example sha256_bool --features=boolean
+	--example sha256_bool \
+	--features=$(TARGET_ARCH_FEATURE),boolean

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
 pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested check_intra_md_links \
-clippy_all check_compile_tests
-# TFHE lints deactivated as it's incompatible with 1.83 - temporary
-# tfhe_lints
+clippy_all tfhe_lints check_compile_tests

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
 pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu check_rust_bindings_did_not_change
--- a/README.md
+++ b/README.md
@@ -70,8 +70,22 @@ production-ready library for all the advanced features of TFHE.
 ### Cargo.toml configuration
 To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:

+ For x86_64-based machines running Unix-like OSes:
+
 ```toml
-tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
+tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64-unix"] }
+```
+
+ For Apple Silicon or aarch64-based machines running Unix-like OSes:
+
+```toml
+tfhe = { version = "*", features = ["boolean", "shortint", "integer", "aarch64-unix"] }
+```
+
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:
+
+```toml
+tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"] }
 ```

 > [!Note]
--- a/apps/trivium/Cargo.toml
+++ b/apps/trivium/Cargo.toml
@@ -7,7 +7,14 @@ edition = "2021"

 [dependencies]
 rayon = { workspace = true }
-tfhe = { path = "../../tfhe", features = [ "boolean", "shortint", "integer" ] }
+
+[target.'cfg(target_arch = "x86_64")'.dependencies.tfhe]
+path = "../../tfhe"
+features = [ "boolean", "shortint", "integer", "x86_64" ]
+
+[target.'cfg(target_arch = "aarch64")'.dependencies.tfhe]
+path = "../../tfhe"
+features = [ "boolean", "shortint", "integer", "aarch64-unix" ]

 [dev-dependencies]
 criterion = { version = "0.5.1", features = [ "html_reports" ]}
--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,6 +1,6 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
+use tfhe::shortint::parameters::PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
 use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};
@@ -12,12 +12,12 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -63,12 +63,12 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -109,12 +109,12 @@ pub fn kreyvium_shortint_trans(c: &mut Criterion) {
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,6 +1,6 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
+use tfhe::shortint::parameters::PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
 use tfhe_trivium::{TransCiphering, TriviumStreamShortint};
@@ -12,12 +12,12 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -63,12 +63,12 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -109,12 +109,12 @@ pub fn trivium_shortint_trans(c: &mut Criterion) {
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,6 +1,6 @@
 use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
+use tfhe::shortint::parameters::PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo renaud1239/Kreyvium,
 // commit fd6828f68711276c25f55e605935028f5e843f43
@@ -222,12 +222,12 @@ fn kreyvium_test_shortint_long() {
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,6 +1,6 @@
 use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
+use tfhe::shortint::parameters::PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64;
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
 // file testvectors/trivium-80.80.test-vectors
@@ -358,12 +358,12 @@ fn trivium_test_shortint_long() {
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
+        gen_keys(PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
+        PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.7.0"
+version = "0.6.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -38,7 +38,6 @@ template <typename Torus> struct int_compression {

      scratch_packing_keyswitch_lwe_list_to_glwe_64(
          streams[0], gpu_indexes[0], &fp_ks_buffer,
-          compression_params.small_lwe_dimension,
          compression_params.glwe_dimension, compression_params.polynomial_size,
          num_radix_blocks, true);
    }
@@ -65,7 +64,7 @@ template <typename Torus> struct int_decompression {
  Torus *tmp_extracted_lwe;
  uint32_t *tmp_indexes_array;

-  int_radix_lut<Torus> *decompression_rescale_lut;
+  int_radix_lut<Torus> *carry_extract_lut;

  int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                    uint32_t gpu_count, int_radix_params encryption_params,
@@ -84,7 +83,7 @@ template <typename Torus> struct int_decompression {
      Torus lwe_accumulator_size = (compression_params.glwe_dimension *
                                        compression_params.polynomial_size +
                                    1);
-      decompression_rescale_lut = new int_radix_lut<Torus>(
+      carry_extract_lut = new int_radix_lut<Torus>(
          streams, gpu_indexes, gpu_count, encryption_params, 1,
          num_radix_blocks, allocate_gpu_memory);

@@ -97,28 +96,19 @@ template <typename Torus> struct int_decompression {
          num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
          gpu_indexes[0]);

-      // Rescale is done using an identity LUT
-      // Here we do not divide by message_modulus
-      // Example: in the 2_2 case we are mapping a 2 bits message onto a 4 bits
-      // space, we want to keep the original 2 bits value in the 4 bits space,
-      // so we apply the identity and the encoding will rescale it for us.
-      auto decompression_rescale_f = [encryption_params](Torus x) -> Torus {
-        return x;
+      // Carry extract LUT
+      auto carry_extract_f = [encryption_params](Torus x) -> Torus {
+        return x / encryption_params.message_modulus;
      };

-      auto effective_compression_message_modulus =
-          encryption_params.carry_modulus;
-      auto effective_compression_carry_modulus = 1;
-
-      generate_device_accumulator_with_encoding<Torus>(
-          streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
+      generate_device_accumulator<Torus>(
+          streams[0], gpu_indexes[0],
+          carry_extract_lut->get_lut(gpu_indexes[0], 0),
          encryption_params.glwe_dimension, encryption_params.polynomial_size,
-          effective_compression_message_modulus,
-          effective_compression_carry_modulus,
          encryption_params.message_modulus, encryption_params.carry_modulus,
-          decompression_rescale_f);
+          carry_extract_f);

-      decompression_rescale_lut->broadcast_lut(streams, gpu_indexes, 0);
+      carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
    }
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -127,8 +117,8 @@ template <typename Torus> struct int_decompression {
    cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
    cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]);

-    decompression_rescale_lut->release(streams, gpu_indexes, gpu_count);
-    delete decompression_rescale_lut;
+    carry_extract_lut->release(streams, gpu_indexes, gpu_count);
+    delete carry_extract_lut;
  }
 };
 #endif
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -46,14 +46,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory);
-void scratch_cuda_apply_many_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, bool allocate_gpu_memory);
+
 void cuda_apply_univariate_lut_kb_64(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count, void *output_radix_lwe,
@@ -447,41 +440,5 @@ void cleanup_cuda_integer_abs_inplace(void *const *streams,
                                      uint32_t gpu_count,
                                      int8_t **mem_ptr_void);

-void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_are_all_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
-
-void cleanup_cuda_integer_are_all_comparisons_block_true(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
-void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
-
-void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);
-
-void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void);
-
 } // extern C
 #endif // CUDA_INTEGER_H
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
+++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h
@@ -21,8 +21,8 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(

 void scratch_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t num_lwes, bool allocate_gpu_memory);
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
+    bool allocate_gpu_memory);

 void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, void *glwe_array_out,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
@@ -1,358 +0,0 @@
-#ifndef CNCRT_FAST_KS_CUH
-#define CNCRT_FAST_KS_CUH
-
-#undef NDEBUG
-#include <assert.h>
-
-#include "device.h"
-#include "gadget.cuh"
-#include "helper_multi_gpu.h"
-#include "keyswitch.cuh"
-#include "polynomial/functions.cuh"
-#include "polynomial/polynomial_math.cuh"
-#include "torus.cuh"
-#include "utils/helper.cuh"
-#include "utils/kernel_dimensions.cuh"
-#include <thread>
-#include <vector>
-
-#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
-
-const int BLOCK_SIZE_GEMM = 64;
-const int THREADS_GEMM = 8;
-const int BLOCK_SIZE_DECOMP = 8;
-
-template <typename Torus> uint64_t get_shared_mem_size_tgemm() {
-  return BLOCK_SIZE_GEMM * THREADS_GEMM * 2 * sizeof(Torus);
-}
-
-__host__ inline bool can_use_pks_fast_path(uint32_t lwe_dimension,
-                                           uint32_t num_lwe,
-                                           uint32_t polynomial_size,
-                                           uint32_t level_count,
-                                           uint32_t glwe_dimension) {
-  // TODO: activate it back, fix tests and extend to level_count > 1
-  return false;
-}
-
-// Initialize decomposition by performing rounding
-// and decomposing one level of an array of Torus LWEs. Only
-// decomposes the mask elements of the incoming LWEs.
-template <typename Torus, typename TorusVec>
-__global__ void decompose_vectorize_init(Torus const *lwe_in, Torus *lwe_out,
-                                         uint32_t lwe_dimension,
-                                         uint32_t num_lwe, uint32_t base_log,
-                                         uint32_t level_count) {
-
-  // index of this LWE ct in the buffer
-  auto lwe_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  // index of the LWE sample in the LWE ct
-  auto lwe_sample_idx = blockIdx.y * blockDim.y + threadIdx.y;
-
-  if (lwe_idx >= num_lwe || lwe_sample_idx >= lwe_dimension)
-    return;
-
-  // Input LWE array is [mask_0, .., mask_lwe_dim, message] and
-  // we only decompose the mask. Thus the stride for reading
-  // is lwe_dimension + 1, while for writing it is lwe_dimension
-  auto read_val_idx = lwe_idx * (lwe_dimension + 1) + lwe_sample_idx;
-  auto write_val_idx = lwe_idx * lwe_dimension + lwe_sample_idx;
-
-  Torus a_i = lwe_in[read_val_idx];
-
-  Torus state = init_decomposer_state(a_i, base_log, level_count);
-
-  Torus mod_b_mask = (1ll << base_log) - 1ll;
-  lwe_out[write_val_idx] = decompose_one<Torus>(state, mod_b_mask, base_log);
-}
-
-// Continue decomposiion of an array of Torus elements in place. Supposes
-// that the array contains already decomposed elements and
-// computes the new decomposed level in place.
-template <typename Torus, typename TorusVec>
-__global__ void
-decompose_vectorize_step_inplace(Torus *buffer_in, uint32_t lwe_dimension,
-                                 uint32_t num_lwe, uint32_t base_log,
-                                 uint32_t level_count) {
-
-  // index of this LWE ct in the buffer
-  auto lwe_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  // index of the LWE sample in the LWE ct
-  auto lwe_sample_idx = blockIdx.y * blockDim.y + threadIdx.y;
-
-  if (lwe_idx >= num_lwe || lwe_sample_idx >= lwe_dimension)
-    return;
-
-  auto val_idx = lwe_idx * lwe_dimension + lwe_sample_idx;
-
-  Torus state = buffer_in[val_idx];
-
-  Torus mod_b_mask = (1ll << base_log) - 1ll;
-
-  buffer_in[val_idx] = decompose_one<Torus>(state, mod_b_mask, base_log);
-}
-
-// Multiply matrices A, B of size (M, K), (K, N) respectively
-// with K as the inner dimension.
-//
-// A block of threads processeds blocks of size (BLOCK_SIZE_GEMM,
-// BLOCK_SIZE_GEMM) splitting them in multiple tiles: (BLOCK_SIZE_GEMM,
-// THREADS_GEMM)-shaped tiles of values from A, and a (THREADS_GEMM,
-// BLOCK_SIZE_GEMM)-shaped tiles of values from B.
-template <typename Torus, typename TorusVec>
-__global__ void tgemm(int M, int N, int K, const Torus *A, const Torus *B,
-                      int stride_B, Torus *C) {
-
-  const int BM = BLOCK_SIZE_GEMM;
-  const int BN = BLOCK_SIZE_GEMM;
-  const int BK = THREADS_GEMM;
-  const int TM = THREADS_GEMM;
-
-  const uint cRow = blockIdx.y;
-  const uint cCol = blockIdx.x;
-
-  const uint totalResultsBlocktile = BM * BN;
-  const int threadCol = threadIdx.x % BN;
-  const int threadRow = threadIdx.x / BN;
-
-  // Allocate space for the current block tile in shared memory
-  __shared__ Torus As[BM * BK];
-  __shared__ Torus Bs[BK * BN];
-
-  // Initialize the pointers to the input blocks from A, B
-  // Tiles from these blocks are loaded to shared memory
-  A += cRow * BM * K;
-  B += cCol * BN;
-
-  // Each thread will handle multiple sub-blocks
-  const uint innerColA = threadIdx.x % BK;
-  const uint innerRowA = threadIdx.x / BK;
-  const uint innerColB = threadIdx.x % BN;
-  const uint innerRowB = threadIdx.x / BN;
-
-  // allocate thread-local cache for results in registerfile
-  Torus threadResults[TM] = {0};
-
-  auto row_A = cRow * BM + innerRowA;
-  auto col_B = cCol * BN + innerColB;
-
-  // For each thread, loop over block tiles
-  for (uint bkIdx = 0; bkIdx < K; bkIdx += BK) {
-    auto col_A = bkIdx + innerColA;
-    auto row_B = bkIdx + innerRowB;
-
-    if (row_A < M && col_A < K) {
-      As[innerRowA * BK + innerColA] = A[innerRowA * K + innerColA];
-    } else {
-      As[innerRowA * BK + innerColA] = 0;
-    }
-
-    if (col_B < N && row_B < K) {
-      Bs[innerRowB * BN + innerColB] = B[innerRowB * stride_B + innerColB];
-    } else {
-      Bs[innerRowB * BN + innerColB] = 0;
-    }
-    __syncthreads();
-
-    // Advance blocktile for the next iteration of this loop
-    A += BK;
-    B += BK * stride_B;
-
-    // calculate per-thread results
-    for (uint dotIdx = 0; dotIdx < BK; ++dotIdx) {
-      // we make the dotproduct loop the outside loop, which facilitates
-      // reuse of the Bs entry, which we can cache in a tmp var.
-      Torus tmp = Bs[dotIdx * BN + threadCol];
-      for (uint resIdx = 0; resIdx < TM; ++resIdx) {
-        threadResults[resIdx] +=
-            As[(threadRow * TM + resIdx) * BK + dotIdx] * tmp;
-      }
-    }
-    __syncthreads();
-  }
-
-  // Initialize the pointer to the output block of size (BLOCK_SIZE_GEMM,
-  // BLOCK_SIZE_GEMM)
-  C += cRow * BM * N + cCol * BN;
-
-  // write out the results
-  for (uint resIdx = 0; resIdx < TM; ++resIdx) {
-    int outRow = cRow * BM + threadRow * TM + resIdx;
-    int outCol = cCol * BN + threadCol;
-
-    if (outRow >= M)
-      continue;
-    if (outCol >= N)
-      continue;
-
-    C[(threadRow * TM + resIdx) * N + threadCol] += threadResults[resIdx];
-  }
-}
-
-// Finish the keyswitching operation and prepare GLWEs for accumulation.
-// 1. Finish the keyswitching computation partially performed with a GEMM:
-//  - negate the dot product between the GLWE and KSK polynomial
-//  - add the GLWE message for the N-th polynomial coeff in the message poly
-// 2. Rotate each of the GLWE . KSK poly dot products to
-//    prepare them for accumulation into a single GLWE
-template <typename Torus>
-__global__ void polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C(
-    Torus *in_glwe_buffer, Torus *out_glwe_buffer, Torus const *lwe_array,
-    uint32_t lwe_dimension, uint32_t num_glwes, uint32_t polynomial_size,
-    uint32_t glwe_dimension) {
-
-  uint32_t glwe_id = blockIdx.x * blockDim.x + threadIdx.x;
-  uint32_t degree = glwe_id; // lwe 0 rotate 0, lwe 1 rotate 1, .. , lwe
-                             // poly_size-1 rotate poly_size-1
-  uint32_t coeffIdx = blockIdx.y * blockDim.y + threadIdx.y;
-
-  if (glwe_id >= num_glwes)
-    return;
-  if (coeffIdx >= polynomial_size)
-    return;
-
-  auto in_poly =
-      in_glwe_buffer + glwe_id * polynomial_size * (glwe_dimension + 1);
-  auto out_result =
-      out_glwe_buffer + glwe_id * polynomial_size * (glwe_dimension + 1);
-  if (coeffIdx == 0) {
-    // Add the message value of the input LWE (`C`) to the N-th coefficient
-    // in the GLWE . KSK dot product
-
-    // The C is added to the first position of the last polynomial in the GLWE
-    // which has (glwe_dimension+1) polynomials
-    // The C value is extracted as the last value of the LWE ct. (of index
-    // glwe_id) the LWEs have (polynomial_size + 1) values
-    in_poly[polynomial_size * glwe_dimension] =
-        lwe_array[glwe_id * (lwe_dimension + 1) + lwe_dimension] -
-        in_poly[polynomial_size * glwe_dimension];
-
-    for (int gi = 1; gi < glwe_dimension; ++gi)
-      in_poly[coeffIdx + gi * polynomial_size] =
-          -in_poly[coeffIdx + gi * polynomial_size];
-
-  } else {
-    // Otherwise simply negate the input coefficient
-    for (int gi = 1; gi < glwe_dimension + 1; ++gi)
-      in_poly[coeffIdx + gi * polynomial_size] =
-          -in_poly[coeffIdx + gi * polynomial_size];
-  }
-  // Negate all the coefficients for rotation for the first poly
-  in_poly[coeffIdx] = -in_poly[coeffIdx];
-
-  // rotate the body
-  polynomial_accumulate_monic_monomial_mul<Torus>(
-      out_result, in_poly, degree, coeffIdx, polynomial_size, 1, true);
-  // rotate the mask too
-  for (int gi = 1; gi < glwe_dimension + 1; ++gi)
-    polynomial_accumulate_monic_monomial_mul<Torus>(
-        out_result + gi * polynomial_size, in_poly + gi * polynomial_size,
-        degree, coeffIdx, polynomial_size, 1, true);
-}
-
-template <typename Torus, typename TorusVec>
-__host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(
-    cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
-    Torus const *lwe_array_in, Torus const *fp_ksk_array, int8_t *fp_ks_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
-
-  // Optimization of packing keyswitch when packing many LWEs
-
-  if (level_count > 1) {
-    PANIC("Fast path PKS only supports level_count==1");
-  }
-
-  cudaSetDevice(gpu_index);
-  check_cuda_error(cudaGetLastError());
-
-  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
-
-  // The fast path of PKS uses the scratch buffer (d_mem) differently than the
-  // old path: it needs to store the decomposed masks in the first half of this
-  // buffer and the keyswitched GLWEs in the second half of the buffer. Thus the
-  // scratch buffer for the fast path must determine the half-size of the
-  // scratch buffer as the max between the size of the GLWE and the size of the
-  // LWE-mask
-  int memory_unit = glwe_accumulator_size > lwe_dimension
-                        ? glwe_accumulator_size
-                        : lwe_dimension;
-
-  // ping pong the buffer between successive calls
-  // split the buffer in two parts of this size
-  auto d_mem_0 = (Torus *)fp_ks_buffer;
-  auto d_mem_1 = d_mem_0 + num_lwes * memory_unit;
-
-  // Set the scratch buffer to 0 as it is used to accumulate
-  // decomposition temporary results
-  cuda_memset_async(d_mem_1, 0, num_lwes * memory_unit * sizeof(Torus), stream,
-                    gpu_index);
-  check_cuda_error(cudaGetLastError());
-
-  // decompose LWEs
-  // don't decompose LWE body - the LWE has lwe_size + 1 elements. The last
-  // element, the body is ignored by rounding down the number of blocks assuming
-  // here that the LWE dimension is a multiple of the block size
-  dim3 grid_decomp(CEIL_DIV(num_lwes, BLOCK_SIZE_DECOMP),
-                   CEIL_DIV(lwe_dimension, BLOCK_SIZE_DECOMP));
-  dim3 threads_decomp(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);
-
-  // decompose first level
-  decompose_vectorize_init<Torus, TorusVec>
-      <<<grid_decomp, threads_decomp, 0, stream>>>(lwe_array_in, d_mem_0,
-                                                   lwe_dimension, num_lwes,
-                                                   base_log, level_count);
-  check_cuda_error(cudaGetLastError());
-
-  // gemm to ks the individual LWEs to GLWEs
-  dim3 grid_gemm(CEIL_DIV(glwe_accumulator_size, BLOCK_SIZE_GEMM),
-                 CEIL_DIV(num_lwes, BLOCK_SIZE_GEMM));
-  dim3 threads_gemm(BLOCK_SIZE_GEMM * THREADS_GEMM);
-
-  auto stride_KSK_buffer = glwe_accumulator_size;
-
-  uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
-  tgemm<Torus, TorusVec><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
-      num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
-      stride_KSK_buffer, d_mem_1);
-  check_cuda_error(cudaGetLastError());
-
-  /*
-    TODO: transpose key to generalize to level_count > 1
-
-    for (int li = 1; li < level_count; ++li) {
-      decompose_vectorize_step_inplace<Torus, TorusVec>
-          <<<grid_decomp, threads_decomp, 0, stream>>>(
-              d_mem_0, lwe_dimension, num_lwes, base_log, level_count);
-      check_cuda_error(cudaGetLastError());
-
-      tgemm<Torus, TorusVec><<<grid_gemm, threads_gemm, shared_mem_size,
-    stream>>>( num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0,
-          fp_ksk_array + li * ksk_block_size, stride_KSK_buffer, d_mem_1);
-      check_cuda_error(cudaGetLastError());
-    }
-  */
-
-  // should we include the mask in the rotation ??
-  dim3 grid_rotate(CEIL_DIV(num_lwes, BLOCK_SIZE_DECOMP),
-                   CEIL_DIV(polynomial_size, BLOCK_SIZE_DECOMP));
-  dim3 threads_rotate(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);
-  // rotate the GLWEs
-  polynomial_accumulate_monic_monomial_mul_many_neg_and_add_C<Torus>
-      <<<grid_rotate, threads_rotate, 0, stream>>>(
-          d_mem_1, d_mem_0, lwe_array_in, lwe_dimension, num_lwes,
-          polynomial_size, glwe_dimension);
-  check_cuda_error(cudaGetLastError());
-
-  dim3 grid_accumulate(
-      CEIL_DIV(polynomial_size * (glwe_dimension + 1), BLOCK_SIZE_DECOMP));
-  dim3 threads_accum(BLOCK_SIZE_DECOMP);
-
-  // accumulate to a single glwe
-  accumulate_glwes<Torus><<<grid_accumulate, threads_accum, 0, stream>>>(
-      glwe_out, d_mem_0, glwe_dimension, polynomial_size, num_lwes);
-
-  check_cuda_error(cudaGetLastError());
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu
@@ -1,8 +1,6 @@
-#include "fast_packing_keyswitch.cuh"
 #include "keyswitch.cuh"
 #include "keyswitch.h"
 #include <cstdint>
-#include <stdio.h>

 /* Perform keyswitch on a batch of 32 bits input LWE ciphertexts.
 * Head out to the equivalent operation on 64 bits for more details.
@@ -55,17 +53,15 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(

 void scratch_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t num_lwes, bool allocate_gpu_memory) {
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
+    bool allocate_gpu_memory) {
  scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
-      static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer, lwe_dimension,
+      static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer,
      glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
 }
-
 /* Perform functional packing keyswitch on a batch of 64 bits input LWE
 * ciphertexts.
 */
-
 void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    void *stream, uint32_t gpu_index, void *glwe_array_out,
    void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
@@ -73,24 +69,13 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64(
    uint32_t output_polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t num_lwes) {

-  if (can_use_pks_fast_path(input_lwe_dimension, num_lwes,
-                            output_polynomial_size, level_count,
-                            output_glwe_dimension)) {
-    host_fast_packing_keyswitch_lwe_list_to_glwe<uint64_t, ulonglong4>(
-        static_cast<cudaStream_t>(stream), gpu_index,
-        static_cast<uint64_t *>(glwe_array_out),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
-        input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
-        base_log, level_count, num_lwes);
-  } else
-    host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
-        static_cast<cudaStream_t>(stream), gpu_index,
-        static_cast<uint64_t *>(glwe_array_out),
-        static_cast<const uint64_t *>(lwe_array_in),
-        static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
-        input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
-        base_log, level_count, num_lwes);
+  host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<uint64_t *>(glwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_in),
+      static_cast<const uint64_t *>(fp_ksk_array), fp_ks_buffer,
+      input_lwe_dimension, output_glwe_dimension, output_polynomial_size,
+      base_log, level_count, num_lwes);
 }

 void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -158,20 +158,16 @@ void execute_keyswitch_async(cudaStream_t const *streams,
 template <typename Torus>
 __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
    cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t num_lwes, bool allocate_gpu_memory) {
+    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
+    bool allocate_gpu_memory) {
  cudaSetDevice(gpu_index);

  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

-  int memory_unit = glwe_accumulator_size > lwe_dimension
-                        ? glwe_accumulator_size
-                        : lwe_dimension;
-
-  if (allocate_gpu_memory) {
+  if (allocate_gpu_memory)
    *fp_ks_buffer = (int8_t *)cuda_malloc_async(
-        2 * num_lwes * memory_unit * sizeof(Torus), stream, gpu_index);
-  }
+        2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream,
+        gpu_index);
 }

 // public functional packing keyswitch for a single LWE ciphertext
@@ -245,7 +241,6 @@ __global__ void packing_keyswitch_lwe_list_to_glwe(
  auto lwe_in = lwe_array_in + input_id * lwe_size;
  auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
  auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
-
  // KS LWE to GLWE
  packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext<Torus>(
      ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
@@ -298,18 +293,8 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
  dim3 grid(num_blocks, num_lwes);
  dim3 threads(num_threads);

-  // The fast path of PKS uses the scratch buffer (d_mem) differently:
-  // it needs to store the decomposed masks in the first half of this buffer
-  // and the keyswitched GLWEs in the second half of the buffer. Thus the
-  // scratch buffer for the fast path must determine the half-size of the
-  // scratch buffer as the max between the size of the GLWE and the size of the
-  // LWE-mask
-  int memory_unit = glwe_accumulator_size > lwe_dimension_in
-                        ? glwe_accumulator_size
-                        : lwe_dimension_in;
-
  auto d_mem = (Torus *)fp_ks_buffer;
-  auto d_tmp_glwe_array_out = d_mem + num_lwes * memory_unit;
+  auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;

  // individually keyswitch each lwe
  packing_keyswitch_lwe_list_to_glwe<Torus><<<grid, threads, 0, stream>>>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -37,32 +37,39 @@ __host__ void host_integer_radix_cmux_kb(
    uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
-  Torus lwe_size = params.big_lwe_dimension + 1;
-  Torus radix_lwe_size = lwe_size * num_radix_blocks;
-  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in, lwe_array_true,
-                               radix_lwe_size * sizeof(Torus), streams[0],
-                               gpu_indexes[0]);
-  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in + radix_lwe_size,
-                               lwe_array_false, radix_lwe_size * sizeof(Torus),
-                               streams[0], gpu_indexes[0]);
-  for (uint i = 0; i < 2 * num_radix_blocks; i++) {
-    cuda_memcpy_async_gpu_to_gpu(mem_ptr->condition_array + i * lwe_size,
-                                 lwe_condition, lwe_size * sizeof(Torus),
-                                 streams[0], gpu_indexes[0]);
+
+  // Since our CPU threads will be working on different streams we shall assert
+  // the work in the main stream is completed
+  auto true_streams = mem_ptr->zero_if_true_buffer->true_streams;
+  auto false_streams = mem_ptr->zero_if_false_buffer->false_streams;
+  for (uint j = 0; j < gpu_count; j++) {
+    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+  }
+
+  auto mem_true = mem_ptr->zero_if_true_buffer;
+  zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+                     lwe_array_true, lwe_condition, mem_true,
+                     mem_ptr->inverted_predicate_lut, bsks, ksks,
+                     num_radix_blocks);
+  auto mem_false = mem_ptr->zero_if_false_buffer;
+  zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
+                     mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
+                     mem_false, mem_ptr->predicate_lut, bsks, ksks,
+                     num_radix_blocks);
+  for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
+    cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
+  }
+  for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) {
+    cuda_synchronize_stream(false_streams[j], gpu_indexes[j]);
  }
-  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in,
-      mem_ptr->condition_array, bsks, ksks, 2 * num_radix_blocks,
-      mem_ptr->predicate_lut, params.message_modulus);

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
  // have kept its value
-  auto mem_true = mem_ptr->buffer_out;
-  auto mem_false = &mem_ptr->buffer_out[radix_lwe_size];
-  auto added_cts = mem_true;
-  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts, mem_true,
-                       mem_false, params.big_lwe_dimension, num_radix_blocks);
+  auto added_cts = mem_ptr->tmp_true_ct;
+  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
+                       mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
+                       params.big_lwe_dimension, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
@@ -58,9 +58,6 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
  case GE:
  case LT:
  case LE:
-    if (num_radix_blocks % 2 != 0)
-      PANIC("Cuda error (comparisons): the number of radix blocks has to be "
-            "even.")
    host_integer_radix_difference_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
@@ -71,8 +68,6 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
    break;
  case MAX:
  case MIN:
-    if (num_radix_blocks % 2 != 0)
-      PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
    host_integer_radix_maxmin_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
@@ -94,91 +89,3 @@ void cleanup_cuda_integer_comparison(void *const *streams,
      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
-
-void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
-      false, allocate_gpu_memory);
-}
-
-void cuda_integer_are_all_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
-
-  int_comparison_buffer<uint64_t> *buffer =
-      (int_comparison_buffer<uint64_t> *)mem_ptr;
-
-  host_integer_are_all_comparisons_block_true_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_array_in), buffer, bsks,
-      (uint64_t **)(ksks), num_radix_blocks);
-}
-
-void cleanup_cuda_integer_are_all_comparisons_block_true(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_comparison_buffer<uint64_t> *mem_ptr =
-      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
-
-void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          big_lwe_dimension, small_lwe_dimension, ks_level,
-                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
-                          message_modulus, carry_modulus);
-
-  scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
-      false, allocate_gpu_memory);
-}
-
-void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
-
-  int_comparison_buffer<uint64_t> *buffer =
-      (int_comparison_buffer<uint64_t> *)mem_ptr;
-
-  host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(lwe_array_in), buffer, bsks,
-      (uint64_t **)(ksks), num_radix_blocks);
-}
-
-void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr_void) {
-
-  int_comparison_buffer<uint64_t> *mem_ptr =
-      (int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
-  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -58,7 +58,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
 template <typename Torus>
 __host__ void are_all_comparisons_block_true(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks, uint32_t num_radix_blocks) {

@@ -85,19 +85,16 @@ __host__ void are_all_comparisons_block_true(

  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    uint32_t chunk_length = std::min(max_value, remaining_blocks);
+    int num_chunks = remaining_blocks / chunk_length;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = tmp_out;
    auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
-    auto is_max_value_lut = are_all_block_true_buffer->is_max_value;
-    uint32_t chunk_lengths[num_chunks];
-    auto begin_remaining_blocks = remaining_blocks;
+    auto is_equal_to_num_blocks_map =
+        &are_all_block_true_buffer->is_equal_to_lut_map;
    for (int i = 0; i < num_chunks; i++) {
-      uint32_t chunk_length =
-          std::min(max_value, begin_remaining_blocks - i * max_value);
-      chunk_lengths[i] = chunk_length;
      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
                                   input_blocks, big_lwe_dimension,
                                   chunk_length);
@@ -114,33 +111,29 @@ __host__ void are_all_comparisons_block_true(
      // is_non_zero_lut_buffer LUT
      lut = mem_ptr->eq_buffer->is_non_zero_lut;
    } else {
-      if (chunk_lengths[num_chunks - 1] != max_value) {
+      if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
+          (*is_equal_to_num_blocks_map).end()) {
+        // The LUT is already computed
+        lut = (*is_equal_to_num_blocks_map)[chunk_length];
+      } else {
        // LUT needs to be computed
-        uint32_t chunk_length = chunk_lengths[num_chunks - 1];
+        auto new_lut =
+            new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
+                                     max_value, num_radix_blocks, true);
+
        auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
          return x == chunk_length;
        };
        generate_device_accumulator<Torus>(
-            streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
+            streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
            glwe_dimension, polynomial_size, message_modulus, carry_modulus,
            is_equal_to_num_blocks_lut_f);

-        Torus *h_lut_indexes = (Torus *)malloc(num_chunks * sizeof(Torus));
-        for (int index = 0; index < num_chunks; index++) {
-          if (index == num_chunks - 1) {
-            h_lut_indexes[index] = 1;
-          } else {
-            h_lut_indexes[index] = 0;
-          }
-        }
-        cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
-                                 h_lut_indexes, num_chunks * sizeof(Torus),
-                                 streams[0], gpu_indexes[0]);
-        is_max_value_lut->broadcast_lut(streams, gpu_indexes, 0);
-        cuda_synchronize_stream(streams[0], gpu_indexes[0]);
-        free(h_lut_indexes);
+        new_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+
+        (*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
+        lut = new_lut;
      }
-      lut = is_max_value_lut;
    }

    // Applies the LUT
@@ -167,7 +160,7 @@ __host__ void are_all_comparisons_block_true(
 template <typename Torus>
 __host__ void is_at_least_one_comparisons_block_true(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
+    uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
    Torus *const *ksks, uint32_t num_radix_blocks) {

@@ -189,18 +182,14 @@ __host__ void is_at_least_one_comparisons_block_true(
  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 0) {
    // Split in max_value chunks
-    int num_chunks = (remaining_blocks + max_value - 1) / max_value;
+    uint32_t chunk_length = std::min(max_value, remaining_blocks);
+    int num_chunks = remaining_blocks / chunk_length;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = mem_ptr->tmp_lwe_array_out;
    auto accumulator = buffer->tmp_block_accumulated;
-    uint32_t chunk_lengths[num_chunks];
-    auto begin_remaining_blocks = remaining_blocks;
    for (int i = 0; i < num_chunks; i++) {
-      uint32_t chunk_length =
-          std::min(max_value, begin_remaining_blocks - i * max_value);
-      chunk_lengths[i] = chunk_length;
      accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
                                   input_blocks, big_lwe_dimension,
                                   chunk_length);
@@ -460,9 +449,9 @@ __host__ void tree_sign_reduction(
    f = sign_handler_f;
  }
  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
-      polynomial_size, message_modulus, carry_modulus, f);
-  last_lut->broadcast_lut(streams, gpu_indexes, 0);
+      streams[0], gpu_indexes[0], last_lut->get_lut(gpu_indexes[0], 0),
+      glwe_dimension, polynomial_size, message_modulus, carry_modulus, f);
+  last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

  // Last leaf
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
@@ -492,9 +481,8 @@ __host__ void host_integer_radix_difference_check_kb(
  if (carry_modulus >= message_modulus) {
    // Packing is possible
    // Pack inputs
-    Torus *packed_left = diff_buffer->tmp_packed;
-    Torus *packed_right =
-        diff_buffer->tmp_packed + num_radix_blocks / 2 * big_lwe_size;
+    Torus *packed_left = diff_buffer->tmp_packed_left;
+    Torus *packed_right = diff_buffer->tmp_packed_right;
    // In case the ciphertext is signed, the sign block and the one before it
    // are handled separately
    if (mem_ptr->is_signed) {
@@ -513,7 +501,10 @@ __host__ void host_integer_radix_difference_check_kb(
    auto identity_lut = mem_ptr->identity_lut;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
-        2 * packed_num_radix_blocks, identity_lut);
+        packed_num_radix_blocks, identity_lut);
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
+        packed_num_radix_blocks, identity_lut);

    lhs = packed_left;
    rhs = packed_right;
@@ -542,13 +533,11 @@ __host__ void host_integer_radix_difference_check_kb(

      // Compare the last block before the sign block separately
      auto identity_lut = mem_ptr->identity_lut;
-      Torus *packed_left = diff_buffer->tmp_packed;
-      Torus *packed_right =
-          diff_buffer->tmp_packed + num_radix_blocks / 2 * big_lwe_size;
      Torus *last_left_block_before_sign_block =
-          packed_left + packed_num_radix_blocks * big_lwe_size;
+          diff_buffer->tmp_packed_left + packed_num_radix_blocks * big_lwe_size;
      Torus *last_right_block_before_sign_block =
-          packed_right + packed_num_radix_blocks * big_lwe_size;
+          diff_buffer->tmp_packed_right +
+          packed_num_radix_blocks * big_lwe_size;
      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
@@ -626,35 +615,4 @@ __host__ void host_integer_radix_maxmin_kb(
      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
 }

-template <typename Torus>
-__host__ void host_integer_are_all_comparisons_block_true_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
-
-  auto eq_buffer = mem_ptr->eq_buffer;
-
-  // It returns a block encrypting 1 if all input blocks are 1
-  // otherwise the block encrypts 0
-  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
-                                        lwe_array_out, lwe_array_in, mem_ptr,
-                                        bsks, ksks, num_radix_blocks);
-}
-
-template <typename Torus>
-__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
-    int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t num_radix_blocks) {
-
-  auto eq_buffer = mem_ptr->eq_buffer;
-
-  // It returns a block encrypting 1 if all input blocks are 1
-  // otherwise the block encrypts 0
-  is_at_least_one_comparisons_block_true<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
-      bsks, ksks, num_radix_blocks);
-}
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -2,7 +2,6 @@
 #define CUDA_INTEGER_COMPRESSION_CUH

 #include "ciphertext.h"
-#include "crypto/fast_packing_keyswitch.cuh"
 #include "crypto/keyswitch.cuh"
 #include "device.h"
 #include "integer/compression/compression.h"
@@ -117,21 +116,11 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
  while (rem_lwes > 0) {
    auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);

-    if (can_use_pks_fast_path(
-            input_lwe_dimension, chunk_size, compression_params.polynomial_size,
-            compression_params.ks_level, compression_params.glwe_dimension)) {
-      host_fast_packing_keyswitch_lwe_list_to_glwe<Torus, ulonglong4>(
-          streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
-          fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
-          compression_params.polynomial_size, compression_params.ks_base_log,
-          compression_params.ks_level, chunk_size);
-    } else {
-      host_packing_keyswitch_lwe_list_to_glwe<Torus>(
-          streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
-          fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
-          compression_params.polynomial_size, compression_params.ks_base_log,
-          compression_params.ks_level, chunk_size);
-    }
+    host_packing_keyswitch_lwe_list_to_glwe<Torus>(
+        streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
+        fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
+        compression_params.polynomial_size, compression_params.ks_base_log,
+        compression_params.ks_level, chunk_size);

    rem_lwes -= chunk_size;
    lwe_subset += chunk_size * lwe_in_size;
@@ -311,7 +300,7 @@ __host__ void host_integer_decompress(
  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
  /// dimension to a big LWE dimension
  auto encryption_params = h_mem_ptr->encryption_params;
-  auto lut = h_mem_ptr->decompression_rescale_lut;
+  auto lut = h_mem_ptr->carry_extract_lut;
  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
  if (active_gpu_count == 1) {
    execute_pbs_async<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -198,27 +198,6 @@ void scratch_cuda_apply_univariate_lut_kb_64(
      allocate_gpu_memory);
 }

-void scratch_cuda_apply_many_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
-    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
-    uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, bool allocate_gpu_memory) {
-
-  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
-                          glwe_dimension * polynomial_size, lwe_dimension,
-                          ks_level, ks_base_log, pbs_level, pbs_base_log,
-                          grouping_factor, message_modulus, carry_modulus);
-
-  scratch_cuda_apply_many_univariate_lut_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_radix_lut<uint64_t> **)mem_ptr,
-      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      num_many_lut, allocate_gpu_memory);
-}
-
 void cuda_apply_univariate_lut_kb_64(void *const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t gpu_count, void *output_radix_lwe,
@@ -258,7 +237,7 @@ void cuda_apply_many_univariate_lut_kb_64(

 void scratch_cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
+    int8_t **mem_ptr, void *input_lut, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
@@ -272,9 +251,8 @@ void scratch_cuda_apply_bivariate_lut_kb_64(

  scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_radix_lut<uint64_t> **)mem_ptr,
-      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      allocate_gpu_memory);
+      (int_radix_lut<uint64_t> **)mem_ptr, static_cast<uint64_t *>(input_lut),
+      num_radix_blocks, params, allocate_gpu_memory);
 }

 void cuda_apply_bivariate_lut_kb_64(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -627,48 +627,26 @@ void rotate_left(Torus *buffer, int mid, uint32_t array_length) {
  std::rotate(buffer, buffer + mid, buffer + array_length);
 }

-/// Caller needs to ensure that the operation applied is coherent from an
-/// encoding perspective.
-///
-/// For example:
-///
-/// Input encoding has 2 bits and output encoding has 4 bits, applying the
-/// identity lut would map the following:
-///
-/// 0|00|xx -> 0|00|00
-/// 0|01|xx -> 0|00|01
-/// 0|10|xx -> 0|00|10
-/// 0|11|xx -> 0|00|11
-///
-/// The reason is the identity function is computed in the input space but the
-/// scaling is done in the output space, as there are more bits in the output
-/// space, the delta is smaller hence the apparent "division" happening.
 template <typename Torus>
-void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension,
-                                         uint32_t polynomial_size,
-                                         uint32_t input_message_modulus,
-                                         uint32_t input_carry_modulus,
-                                         uint32_t output_message_modulus,
-                                         uint32_t output_carry_modulus,
-                                         std::function<Torus(Torus)> f) {
+void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
+                           uint32_t polynomial_size, uint32_t message_modulus,
+                           uint32_t carry_modulus,
+                           std::function<Torus(Torus)> f) {

-  uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
-  uint32_t output_modulus_sup = output_message_modulus * output_carry_modulus;
-  uint32_t box_size = polynomial_size / input_modulus_sup;
-  auto nbits = sizeof(Torus) * 8;
-  Torus output_delta =
-      (static_cast<Torus>(1) << (nbits - 1)) / output_modulus_sup;
+  uint32_t modulus_sup = message_modulus * carry_modulus;
+  uint32_t box_size = polynomial_size / modulus_sup;
+  Torus delta = (1ul << 63) / modulus_sup;

  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];

  // This accumulator extracts the carry bits
-  for (int i = 0; i < input_modulus_sup; i++) {
+  for (int i = 0; i < modulus_sup; i++) {
    int index = i * box_size;
    for (int j = index; j < index + box_size; j++) {
      auto f_eval = f(i);
-      body[j] = f_eval * output_delta;
+      body[j] = f_eval * delta;
    }
  }

@@ -682,16 +660,6 @@ void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension,
  rotate_left<Torus>(body, half_box_size, polynomial_size);
 }

-template <typename Torus>
-void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
-                           uint32_t polynomial_size, uint32_t message_modulus,
-                           uint32_t carry_modulus,
-                           std::function<Torus(Torus)> f) {
-  generate_lookup_table_with_encoding(acc, glwe_dimension, polynomial_size,
-                                      message_modulus, carry_modulus,
-                                      message_modulus, carry_modulus, f);
-}
-
 template <typename Torus>
 void generate_many_lookup_table(
    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -700,8 +668,7 @@ void generate_many_lookup_table(

  uint32_t modulus_sup = message_modulus * carry_modulus;
  uint32_t box_size = polynomial_size / modulus_sup;
-  auto nbits = sizeof(Torus) * 8;
-  Torus delta = (static_cast<Torus>(1) << (nbits - 1)) / modulus_sup;
+  Torus delta = (1ul << 63) / modulus_sup;

  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

@@ -836,32 +803,6 @@ void generate_device_accumulator_bivariate_with_factor(
  free(h_lut);
 }

-template <typename Torus>
-void generate_device_accumulator_with_encoding(
-    cudaStream_t stream, uint32_t gpu_index, Torus *acc,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_message_modulus, uint32_t input_carry_modulus,
-    uint32_t output_message_modulus, uint32_t output_carry_modulus,
-    std::function<Torus(Torus)> f) {
-
-  // host lut
-  Torus *h_lut =
-      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
-
-  // fill accumulator
-  generate_lookup_table_with_encoding<Torus>(
-      h_lut, glwe_dimension, polynomial_size, input_message_modulus,
-      input_carry_modulus, output_message_modulus, output_carry_modulus, f);
-
-  // copy host lut and lut_indexes_vec to device
-  cuda_memcpy_async_to_gpu(
-      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
-      stream, gpu_index);
-
-  cuda_synchronize_stream(stream, gpu_index);
-  free(h_lut);
-}
-
 /*
 *  generate accumulator for device pointer
 *    v_stream - cuda stream
@@ -877,9 +818,21 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
                                 uint32_t carry_modulus,
                                 std::function<Torus(Torus)> f) {

-  generate_device_accumulator_with_encoding(
-      stream, gpu_index, acc, glwe_dimension, polynomial_size, message_modulus,
-      carry_modulus, message_modulus, carry_modulus, f);
+  // host lut
+  Torus *h_lut =
+      (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus));
+
+  // fill accumulator
+  generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
+                               message_modulus, carry_modulus, f);
+
+  // copy host lut and lut_indexes_vec to device
+  cuda_memcpy_async_to_gpu(
+      acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
+      stream, gpu_index);
+
+  cuda_synchronize_stream(stream, gpu_index);
+  free(h_lut);
 }

 /*
@@ -1102,8 +1055,7 @@ void host_compute_propagation_simulators_and_group_carries(
      message_modulus, carry_modulus);

  uint32_t modulus_sup = message_modulus * carry_modulus;
-  auto nbits = sizeof(Torus) * 8;
-  Torus delta = (static_cast<Torus>(1) << (nbits - 1)) / modulus_sup;
+  Torus delta = (1ull << 63) / modulus_sup;
  auto simulators = mem->simulators;
  auto grouping_pgns = mem->grouping_pgns;
  host_radix_split_simulators_and_grouping_pgns<Torus>(
@@ -1430,8 +1382,8 @@ __host__ void
 create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
                     Torus *lwe_array_out, Torus const *scalar_array,
                     uint32_t lwe_dimension, uint32_t num_radix_blocks,
-                     uint32_t num_scalar_blocks, Torus message_modulus,
-                     Torus carry_modulus) {
+                     uint32_t num_scalar_blocks, uint64_t message_modulus,
+                     uint64_t carry_modulus) {

  cudaSetDevice(gpu_index);
  size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
@@ -1451,9 +1403,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
  // Value of the shift we multiply our messages by
  // If message_modulus and carry_modulus are always powers of 2 we can simplify
  // this
-  auto nbits = sizeof(Torus) * 8;
-  Torus delta = (static_cast<Torus>(1) << (nbits - 1)) /
-                (message_modulus * carry_modulus);
+  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_create_trivial_radix<Torus><<<grid, thds, 0, stream>>>(
      lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
@@ -1513,10 +1463,10 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
  if (num_sign_blocks > 2) {
    auto lut = diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
        reduce_two_orderings_function);
-    lut->broadcast_lut(streams, gpu_indexes, 0);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    while (num_sign_blocks > 2) {
      pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1547,9 +1497,10 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,

    auto lut = diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, final_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, 0);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        final_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
                       big_lwe_dimension, 2, 4);
@@ -1566,9 +1517,10 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,

    auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, final_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, 0);
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        final_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
@@ -1587,11 +1539,11 @@ void scratch_cuda_apply_univariate_lut_kb(
                                      1, num_radix_blocks, allocate_gpu_memory);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
-  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(0, 0), (void *)input_lut,
-                           (params.glwe_dimension + 1) *
-                               params.polynomial_size * sizeof(Torus),
-                           streams[0], gpu_indexes[0]);
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
+  cuda_memcpy_async_to_gpu(
+      (*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
+      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
+      streams[0], gpu_indexes[0]);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 }

 template <typename Torus>
@@ -1607,25 +1559,6 @@ void host_apply_univariate_lut_kb(cudaStream_t const *streams,
      num_blocks, mem);
 }

-template <typename Torus>
-void scratch_cuda_apply_many_univariate_lut_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
-    uint32_t num_radix_blocks, int_radix_params params, uint32_t num_many_lut,
-    bool allocate_gpu_memory) {
-
-  *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
-                                      1, num_radix_blocks, num_many_lut,
-                                      allocate_gpu_memory);
-  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
-  // 0
-  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(0, 0), (void *)input_lut,
-                           (params.glwe_dimension + 1) *
-                               params.polynomial_size * sizeof(Torus),
-                           streams[0], gpu_indexes[0]);
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
-}
-
 template <typename Torus>
 void host_apply_many_univariate_lut_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1649,11 +1582,11 @@ void scratch_cuda_apply_bivariate_lut_kb(
                                      1, num_radix_blocks, allocate_gpu_memory);
  // It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
  // 0
-  cuda_memcpy_async_to_gpu((*mem_ptr)->get_lut(0, 0), (void *)input_lut,
-                           (params.glwe_dimension + 1) *
-                               params.polynomial_size * sizeof(Torus),
-                           streams[0], gpu_indexes[0]);
-  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
+  cuda_memcpy_async_to_gpu(
+      (*mem_ptr)->get_lut(gpu_indexes[0], 0), (void *)input_lut,
+      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
+      streams[0], gpu_indexes[0]);
+  (*mem_ptr)->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
 }

 template <typename Torus>
@@ -1693,12 +1626,13 @@ void host_propagate_single_carry(cudaStream_t const *streams,
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
+  auto message_modulus = params.message_modulus;
+  auto carry_modulus = params.carry_modulus;
  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  auto big_lwe_dimension = big_lwe_size - 1; // For host addition
  auto lut_stride = mem->lut_stride;
  auto num_many_lut = mem->num_many_lut;
-  auto output_flag = mem->output_flag + big_lwe_size * num_radix_blocks;
  if (requested_flag == outputFlag::FLAG_OVERFLOW)
    PANIC("Cuda error: single carry propagation is not supported for overflow, "
          "try using add_and_propagate_single_carry");
@@ -1715,7 +1649,7 @@ void host_propagate_single_carry(cudaStream_t const *streams,

  if (requested_flag == outputFlag::FLAG_CARRY) {
    cuda_memcpy_async_gpu_to_gpu(
-        output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
+        mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
  }
  // Step 2
@@ -1735,40 +1669,45 @@ void host_propagate_single_carry(cudaStream_t const *streams,

  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
+    host_addition<Torus>(streams[0], gpu_indexes[0], mem->output_flag,
+                         mem->output_flag,
                         mem->prop_simu_group_carries_mem->simulators +
                             (num_radix_blocks - 1) * big_lwe_size,
                         big_lwe_dimension, 1);
  }

+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+  // Step 3
+  //  Add carries and cleanup OutputFlag::None
  host_radix_sum_in_groups<Torus>(
-      streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
      mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks,
      big_lwe_size, group_size);
+
+  auto message_extract = mem->lut_message_extract;
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      mem->sub_streams_1, gpu_indexes, gpu_count, lwe_array, prepared_blocks,
+      bsks, ksks, num_radix_blocks, message_extract);
+
  if (requested_flag == outputFlag::FLAG_CARRY) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
+    host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
+                         mem->output_flag, mem->output_flag,
                         mem->prop_simu_group_carries_mem->resolved_carries +
                             (mem->num_groups - 1) * big_lwe_size,
                         big_lwe_dimension, 1);

-    cuda_memcpy_async_gpu_to_gpu(
-        prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
-        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
-        bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);
+        mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
+        mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);

-    cuda_memcpy_async_gpu_to_gpu(lwe_array, mem->output_flag,
-                                 big_lwe_size_bytes * num_radix_blocks,
-                                 streams[0], gpu_indexes[0]);
-    cuda_memcpy_async_gpu_to_gpu(
-        carry_out, mem->output_flag + num_radix_blocks * big_lwe_size,
-        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-  } else {
-    auto message_extract = mem->lut_message_extract;
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks, ksks,
-        num_radix_blocks, message_extract);
+    cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
+                                 big_lwe_size_bytes, mem->sub_streams_2[0],
+                                 gpu_indexes[0]);
+  }
+  for (int j = 0; j < mem->active_gpu_count; j++) {
+    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
+    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
  }
 }

@@ -1784,12 +1723,13 @@ void host_add_and_propagate_single_carry(
  auto params = mem->params;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
+  auto message_modulus = params.message_modulus;
+  auto carry_modulus = params.carry_modulus;
  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
  auto big_lwe_dimension = big_lwe_size - 1; // For host addition
  auto lut_stride = mem->lut_stride;
  auto num_many_lut = mem->num_many_lut;
-  auto output_flag = mem->output_flag + big_lwe_size * num_radix_blocks;

  if (requested_flag == outputFlag::FLAG_OVERFLOW) {
    cuda_memcpy_async_gpu_to_gpu(
@@ -1816,12 +1756,12 @@ void host_add_and_propagate_single_carry(
  if (requested_flag == outputFlag::FLAG_OVERFLOW) {
    auto lut_overflow_prep = mem->lut_overflow_flag_prep;
    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, output_flag, mem->last_lhs,
+        streams, gpu_indexes, gpu_count, mem->output_flag, mem->last_lhs,
        mem->last_rhs, bsks, ksks, 1, lut_overflow_prep,
        lut_overflow_prep->params.message_modulus);
  } else if (requested_flag == outputFlag::FLAG_CARRY) {
    cuda_memcpy_async_gpu_to_gpu(
-        output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
+        mem->output_flag, block_states + (num_radix_blocks - 1) * big_lwe_size,
        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
  }

@@ -1842,50 +1782,58 @@ void host_add_and_propagate_single_carry(

  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
-    host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
+    host_addition<Torus>(streams[0], gpu_indexes[0], mem->output_flag,
+                         mem->output_flag,
                         mem->prop_simu_group_carries_mem->simulators +
                             (num_radix_blocks - 1) * big_lwe_size,
                         big_lwe_dimension, 1);
  }

+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
  // Step 3
  //  Add carries and cleanup OutputFlag::None
  host_radix_sum_in_groups<Torus>(
-      streams[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
+      mem->sub_streams_1[0], gpu_indexes[0], prepared_blocks, prepared_blocks,
      mem->prop_simu_group_carries_mem->resolved_carries, num_radix_blocks,
      big_lwe_size, group_size);

+  auto message_extract = mem->lut_message_extract;
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      mem->sub_streams_1, gpu_indexes, gpu_count, lhs_array, prepared_blocks,
+      bsks, ksks, num_radix_blocks, message_extract);
+
  if (requested_flag == outputFlag::FLAG_OVERFLOW ||
      requested_flag == outputFlag::FLAG_CARRY) {
    if (num_radix_blocks == 1 && requested_flag == outputFlag::FLAG_OVERFLOW &&
        uses_carry == 1) {
-      host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
-                           input_carries, big_lwe_dimension, 1);
+      host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
+                           mem->output_flag, mem->output_flag, input_carries,
+                           big_lwe_dimension, 1);

    } else {

-      host_addition<Torus>(streams[0], gpu_indexes[0], output_flag, output_flag,
+      host_addition<Torus>(mem->sub_streams_2[0], gpu_indexes[0],
+                           mem->output_flag, mem->output_flag,
                           mem->prop_simu_group_carries_mem->resolved_carries +
                               (mem->num_groups - 1) * big_lwe_size,
                           big_lwe_dimension, 1);
    }
-    cuda_memcpy_async_gpu_to_gpu(
-        prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
-        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
-        bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);
-
-    cuda_memcpy_async_gpu_to_gpu(lhs_array, mem->output_flag,
-                                 big_lwe_size_bytes * num_radix_blocks,
-                                 streams[0], gpu_indexes[0]);
-    cuda_memcpy_async_gpu_to_gpu(
-        carry_out, mem->output_flag + num_radix_blocks * big_lwe_size,
-        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-  } else {
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks, ksks,
-        num_radix_blocks, mem->lut_message_extract);
+    if (requested_flag == outputFlag::FLAG_OVERFLOW) {
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
+          mem->output_flag, bsks, ksks, 1, mem->lut_overflow_flag_last);
+    } else {
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          mem->sub_streams_2, gpu_indexes, gpu_count, mem->output_flag,
+          mem->output_flag, bsks, ksks, 1, mem->lut_carry_flag_last);
+    }
+    cuda_memcpy_async_gpu_to_gpu(carry_out, mem->output_flag,
+                                 big_lwe_size_bytes, mem->sub_streams_2[0],
+                                 gpu_indexes[0]);
+  }
+  for (int j = 0; j < mem->active_gpu_count; j++) {
+    cuda_synchronize_stream(mem->sub_streams_1[j], gpu_indexes[j]);
+    cuda_synchronize_stream(mem->sub_streams_2[j], gpu_indexes[j]);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -267,8 +267,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
        streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
        2 * ch_amount * num_blocks, reused_lut);
  }
-  auto message_acc = luts_message_carry->get_lut(0, 0);
-  auto carry_acc = luts_message_carry->get_lut(0, 1);
+  auto message_acc = luts_message_carry->get_lut(gpu_indexes[0], 0);
+  auto carry_acc = luts_message_carry->get_lut(gpu_indexes[0], 1);

  // define functions for each accumulator
  auto lut_f_message = [message_modulus](Torus x) -> Torus {
@@ -285,7 +285,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  generate_device_accumulator<Torus>(
      streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
      message_modulus, carry_modulus, lut_f_carry);
-  luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
+  luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

  while (r > 2) {
    size_t cur_total_blocks = r * num_blocks;
@@ -334,10 +334,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    if (carry_count > 0)
      cuda_set_value_async<Torus>(
          streams[0], gpu_indexes[0],
-          luts_message_carry->get_lut_indexes(0, message_count), 1,
+          luts_message_carry->get_lut_indexes(gpu_indexes[0], message_count), 1,
          carry_count);

-    luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
+    luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    /// For multi GPU execution we create vectors of pointers for inputs and
    /// outputs
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -31,10 +31,10 @@ __host__ void host_integer_radix_scalar_bitop_kb(
  } else {
    // We have all possible LUTs pre-computed and we use the decomposed scalar
    // as index to recover the right one
-    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
-                                 num_clear_blocks * sizeof(Torus), streams[0],
-                                 gpu_indexes[0]);
-    lut->broadcast_lut(streams, gpu_indexes, 0);
+    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(gpu_indexes[0], 0),
+                                 clear_blocks, num_clear_blocks * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cu
@@ -22,9 +22,6 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
  case GE:
  case LT:
  case LE:
-    if (lwe_ciphertext_count % 2 != 0)
-      PANIC("Cuda error (scalar comparisons): the number of radix blocks has "
-            "to be even.")
    host_integer_radix_scalar_difference_check_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
@@ -35,9 +32,6 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
    break;
  case MAX:
  case MIN:
-    if (lwe_ciphertext_count % 2 != 0)
-      PANIC("Cuda error (scalar max/min): the number of radix blocks has to be "
-            "even.")
    host_integer_radix_scalar_maxmin_kb<uint64_t>(
        (cudaStream_t *)(streams), gpu_indexes, gpu_count,
        static_cast<uint64_t *>(lwe_array_out),
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -110,11 +110,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    };

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-    generate_device_accumulator<Torus>(streams[0], gpu_indexes[0],
-                                       lut->get_lut(0, 0), glwe_dimension,
-                                       polynomial_size, message_modulus,
-                                       carry_modulus, scalar_last_leaf_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, 0);
+    generate_device_accumulator<Torus>(
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+        scalar_last_leaf_lut_f);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -141,9 +141,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    //////////////
    // lsb
-    Torus *lhs = diff_buffer->tmp_packed;
-    Torus *rhs =
-        diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size;
+    Torus *lhs = diff_buffer->tmp_packed_left;
+    Torus *rhs = diff_buffer->tmp_packed_right;

    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
                       big_lwe_dimension, num_lsb_radix_blocks,
@@ -195,10 +194,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
        scalar_bivariate_last_leaf_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, 0);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -211,9 +210,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    uint32_t num_lsb_radix_blocks = total_num_radix_blocks;
    uint32_t num_scalar_blocks = total_num_scalar_blocks;

-    Torus *lhs = diff_buffer->tmp_packed;
-    Torus *rhs =
-        diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size;
+    Torus *lhs = diff_buffer->tmp_packed_left;
+    Torus *rhs = diff_buffer->tmp_packed_right;

    pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
                       big_lwe_dimension, num_lsb_radix_blocks,
@@ -331,10 +329,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
+        streams[0], gpu_indexes[0], lut->get_lut(gpu_indexes[0], 0),
+        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
        scalar_bivariate_last_leaf_lut_f);
-    lut->broadcast_lut(streams, gpu_indexes, 0);
+    lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -360,9 +358,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    //////////////
    // lsb
-    Torus *lhs = diff_buffer->tmp_packed;
-    Torus *rhs =
-        diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size;
+    Torus *lhs = diff_buffer->tmp_packed_left;
+    Torus *rhs = diff_buffer->tmp_packed_right;

    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
                       big_lwe_dimension, num_lsb_radix_blocks,
@@ -425,10 +422,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    auto signed_msb_lut = mem_ptr->signed_msb_lut;
    generate_device_accumulator_bivariate<Torus>(
-        msb_streams[0], gpu_indexes[0], signed_msb_lut->get_lut(0, 0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, lut_f);
-    signed_msb_lut->broadcast_lut(streams, gpu_indexes, 0);
+        msb_streams[0], gpu_indexes[0],
+        signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
+        params.polynomial_size, params.message_modulus, params.carry_modulus,
+        lut_f);
+    signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);

    Torus const *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
@@ -461,9 +459,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
    auto lwe_array_sign_out =
        lwe_array_ct_out + (num_lsb_radix_blocks / 2) * big_lwe_size;
-    Torus *lhs = diff_buffer->tmp_packed;
-    Torus *rhs =
-        diff_buffer->tmp_packed + total_num_radix_blocks / 2 * big_lwe_size;
+    Torus *lhs = diff_buffer->tmp_packed_left;
+    Torus *rhs = diff_buffer->tmp_packed_right;

    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
                       big_lwe_dimension, num_lsb_radix_blocks - 1,
@@ -679,10 +676,10 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
    pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_scalar,
                       scalar_blocks, 0, num_scalar_blocks, message_modulus);

-    cuda_memcpy_async_gpu_to_gpu(scalar_comparison_luts->get_lut_indexes(0, 0),
-                                 packed_scalar,
-                                 num_halved_scalar_blocks * sizeof(Torus),
-                                 lsb_streams[0], gpu_indexes[0]);
+    cuda_memcpy_async_gpu_to_gpu(
+        scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
+        packed_scalar, num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
+        gpu_indexes[0]);
    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -36,7 +36,7 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(

  *mem_ptr =
      new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
-                                   num_radix_blocks, allocate_gpu_memory, true);
+                                   num_radix_blocks, allocate_gpu_memory);
 }

 template <typename T, class params>
@@ -94,11 +94,9 @@ __host__ void host_integer_scalar_mul_radix(
  }
  cuda_synchronize_stream(streams[0], gpu_indexes[0]);

-  if (mem->anticipated_buffers_drop) {
-    cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
-    mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
-    delete (mem->logical_scalar_shift_buffer);
-  }
+  cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
+  mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
+  delete (mem->logical_scalar_shift_buffer);

  if (j == 0) {
    // lwe array = 0
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -136,7 +136,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
+    host_programmable_bootstrap_tbc<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -144,7 +144,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
+    host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -393,7 +393,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
+    host_programmable_bootstrap_cg<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -401,7 +401,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
+    host_programmable_bootstrap_cg<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -468,7 +468,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 512:
-    host_programmable_bootstrap<Torus, AmortizedDegree<512>>(
+    host_programmable_bootstrap<Torus, Degree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -476,7 +476,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
        num_many_lut, lut_stride);
    break;
  case 1024:
-    host_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
+    host_programmable_bootstrap<Torus, Degree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
        lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
        lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -237,7 +237,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
        (ClassicalProgrammableBootstrapTestParams){
            887, 1, 2048, new_t_uniform(46), new_t_uniform(17), 22, 1, 4, 4,
            100, 1, 1},
-        // V0_11_PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
+        // PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
        (ClassicalProgrammableBootstrapTestParams){
            977, 1, 8192, new_gaussian_from_std_dev(3.0144389706858286e-07),
            new_gaussian_from_std_dev(2.168404344971009e-19), 16, 2, 8, 8, 100,
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -163,29 +163,6 @@ extern "C" {
        allocate_gpu_memory: bool,
    );
 }
-extern "C" {
-    pub fn scratch_cuda_apply_many_univariate_lut_kb_64(
-        streams: *const *mut ffi::c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        input_lut: *const ffi::c_void,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_radix_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: PBS_TYPE,
-        num_many_lut: u32,
-        allocate_gpu_memory: bool,
-    );
-}
 extern "C" {
    pub fn cuda_apply_univariate_lut_kb_64(
        streams: *const *mut ffi::c_void,
@@ -1106,92 +1083,6 @@ extern "C" {
        mem_ptr_void: *mut *mut i8,
    );
 }
-extern "C" {
-    pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
-        streams: *const *mut ffi::c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_radix_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: PBS_TYPE,
-        allocate_gpu_memory: bool,
-    );
-}
-extern "C" {
-    pub fn cuda_integer_are_all_comparisons_block_true_kb_64(
-        streams: *const *mut ffi::c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array_out: *mut ffi::c_void,
-        lwe_array_in: *const ffi::c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut ffi::c_void,
-        ksks: *const *mut ffi::c_void,
-        num_radix_blocks: u32,
-    );
-}
-extern "C" {
-    pub fn cleanup_cuda_integer_are_all_comparisons_block_true(
-        streams: *const *mut ffi::c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr_void: *mut *mut i8,
-    );
-}
-extern "C" {
-    pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
-        streams: *const *mut ffi::c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_radix_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: PBS_TYPE,
-        allocate_gpu_memory: bool,
-    );
-}
-extern "C" {
-    pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
-        streams: *const *mut ffi::c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array_out: *mut ffi::c_void,
-        lwe_array_in: *const ffi::c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut ffi::c_void,
-        ksks: *const *mut ffi::c_void,
-        num_radix_blocks: u32,
-    );
-}
-extern "C" {
-    pub fn cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
-        streams: *const *mut ffi::c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr_void: *mut *mut i8,
-    );
-}
 extern "C" {
    pub fn cuda_keyswitch_lwe_ciphertext_vector_32(
        stream: *mut ffi::c_void,
@@ -1229,7 +1120,6 @@ extern "C" {
        stream: *mut ffi::c_void,
        gpu_index: u32,
        fp_ks_buffer: *mut *mut i8,
-        lwe_dimension: u32,
        glwe_dimension: u32,
        polynomial_size: u32,
        num_lwes: u32,
--- a/ci/lattice_estimator.sage
+++ b/ci/lattice_estimator.sage
@@ -91,8 +91,6 @@ if __name__ == "__main__":
        "boolean_parameters_lattice_estimator.sage",
        "shortint_classic_parameters_lattice_estimator.sage",
        "shortint_multi_bit_parameters_lattice_estimator.sage",
-        "shortint_cpke_parameters_lattice_estimator.sage",
-        "shortint_list_compression_parameters_lattice_estimator.sage",
    ):
        to_update, to_watch = check_security(params_filename)
        params_to_update.extend(to_update)
--- a/ci/slab.toml
+++ b/ci/slab.toml
@@ -31,7 +31,7 @@ instance_type = "m6i.4xlarge"
 [backend.hyperstack.gpu-test]
 environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
-flavor_name = "n3-L40x1"
+flavor_name = "n3-RTX-A6000x1"

 [backend.hyperstack.single-h100]
 environment_name = "canada"
@@ -58,12 +58,6 @@ environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
 flavor_name = "n3-H100x8-NVLink"

-
-[backend.hyperstack.multi-h100-sxm5]
-environment_name = "canada"
-image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
-flavor_name = "n3-H100-SXM5x8"
-
 [backend.hyperstack.multi-a100-nvlink]
 environment_name = "canada"
 image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2"
--- a/scripts/get_arch_feature.sh
+++ b/scripts/get_arch_feature.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+# Print the architecture feature name for the current host:
+# "x86_64" or "aarch64", with a "-unix" suffix on Linux and macOS.
+
+set -e
+
+# Default to x86_64; only arm64/aarch64 machines switch to aarch64.
+ARCH_FEATURE=x86_64
+
+# Match on the machine hardware name only: `uname -a` also prints the hostname,
+# so a host named e.g. "arm64-runner" would be misdetected on x86_64 hardware.
+MACHINE="$(uname -m)"
+
+if [[ "${MACHINE}" == *arm64* || "${MACHINE}" == *aarch64* ]]; then
+    ARCH_FEATURE=aarch64
+fi
+
+UNAME="$(uname)"
+
+# Linux and Darwin (macOS) use the "-unix" variant of the feature name.
+if [[ "${UNAME}" == "Linux" || "${UNAME}" == "Darwin" ]]; then
+    ARCH_FEATURE="${ARCH_FEATURE}-unix"
+fi
+
+echo "${ARCH_FEATURE}"
--- a/scripts/integer-tests.sh
+++ b/scripts/integer-tests.sh
@@ -10,9 +10,6 @@ function usage() {
    echo "--multi-bit               Run multi-bit tests only: default off"
    echo "--unsigned-only           Run only unsigned integer tests, by default both signed and unsigned tests are run"
    echo "--signed-only             Run only signed integer tests, by default both signed and unsigned tests are run"
-    echo "--nightly-tests           Run integer tests configured for nightly runs (3_3 params)"
-    echo "--fast-tests              Run integer set but skip a subset of longer tests"
-    echo "--long-tests              Run only long run integer tests"
    echo "--cargo-profile           The cargo profile used to build tests"
    echo "--backend                 Backend to use with tfhe-rs"
    echo "--avx512-support          Set to ON to enable avx512"
@@ -24,7 +21,6 @@ RUST_TOOLCHAIN="+stable"
 multi_bit_argument=
 sign_argument=
 fast_tests_argument=
-long_tests_argument=
 nightly_tests_argument=
 no_big_params_argument=
 cargo_profile="release"
@@ -95,10 +91,6 @@ if [[ "${FAST_TESTS}" == TRUE ]]; then
    fast_tests_argument=--fast-tests
 fi

-if [[ "${LONG_TESTS}" == TRUE ]]; then
-    long_tests_argument=--long-tests
-fi
-
 if [[ "${NIGHTLY_TESTS}" == TRUE ]]; then
    nightly_tests_argument=--nightly-tests
 fi
@@ -112,6 +104,7 @@ if [[ "${backend}" == "gpu" ]]; then
 fi

 CURR_DIR="$(dirname "$0")"
+ARCH_FEATURE="$("${CURR_DIR}/get_arch_feature.sh")"

 # TODO autodetect/have a finer CPU count depending on memory
 num_cpu_threads="$("${CURR_DIR}"/cpu_count.sh)"
@@ -145,38 +138,32 @@ if [[ "${backend}" == "gpu" ]]; then
    fi
 fi

-filter_expression=$(/usr/bin/python3 scripts/test_filtering.py --layer integer --backend "${backend}" ${fast_tests_argument} ${long_tests_argument} ${nightly_tests_argument} ${multi_bit_argument} ${sign_argument} ${no_big_params_argument})
+filter_expression=$(/usr/bin/python3 scripts/test_filtering.py --layer integer --backend "${backend}" ${fast_tests_argument} ${nightly_tests_argument} ${multi_bit_argument} ${sign_argument} ${no_big_params_argument})

 if [[ "${FAST_TESTS}" == "TRUE" ]]; then
    echo "Running 'fast' test set"
-elif [[ "${LONG_TESTS}" == "FALSE" ]]; then
+else
    echo "Running 'slow' test set"
 fi

-if [[ "${LONG_TESTS}" == "TRUE" ]]; then
-    echo "Running 'long run' test set"
-fi
-
 if [[ "${NIGHTLY_TESTS}" == "TRUE" ]]; then
    echo "Running 'nightly' test set"
 fi

-echo "${filter_expression}"
-
 cargo "${RUST_TOOLCHAIN}" nextest run \
    --tests \
    --cargo-profile "${cargo_profile}" \
    --package "${tfhe_package}" \
    --profile ci \
-    --features=integer,internal-keycache,zk-pok,experimental,"${avx512_feature}","${gpu_feature}" \
+    --features="${ARCH_FEATURE}",integer,internal-keycache,zk-pok,experimental,"${avx512_feature}","${gpu_feature}" \
    --test-threads "${test_threads}" \
    -E "$filter_expression"

-if [[ -z ${multi_bit_argument} && -z ${long_tests_argument} ]]; then
+if [[ -z ${multi_bit_argument} ]]; then
    cargo "${RUST_TOOLCHAIN}" test \
        --profile "${cargo_profile}" \
        --package "${tfhe_package}" \
-        --features=integer,internal-keycache,experimental,"${avx512_feature}","${gpu_feature}" \
+        --features="${ARCH_FEATURE}",integer,internal-keycache,experimental,"${avx512_feature}","${gpu_feature}" \
        --doc \
        -- --test-threads="${doctest_threads}" integer::"${gpu_feature}"
 fi
--- a/scripts/shortint-tests.sh
+++ b/scripts/shortint-tests.sh
@@ -65,6 +65,7 @@ if [[ "${FAST_TESTS}" == TRUE ]]; then
 fi

 CURR_DIR="$(dirname "$0")"
+ARCH_FEATURE="$("${CURR_DIR}/get_arch_feature.sh")"

 n_threads_small="$("${CURR_DIR}"/cpu_count.sh)"
 n_threads_big="${n_threads_small}"
@@ -93,7 +94,7 @@ if [[ "${BIG_TESTS_INSTANCE}" != TRUE ]]; then
        --cargo-profile "${cargo_profile}" \
        --package "${tfhe_package}" \
        --profile ci \
-        --features=shortint,internal-keycache,zk-pok,experimental \
+        --features="${ARCH_FEATURE}",shortint,internal-keycache,zk-pok,experimental \
        --test-threads "${n_threads_small}" \
        -E "${filter_expression_small_params}"

@@ -110,7 +111,7 @@ and not test(~smart_add_and_mul)"""
        --cargo-profile "${cargo_profile}" \
        --package "${tfhe_package}" \
        --profile ci \
-        --features=shortint,internal-keycache,zk-pok,experimental \
+        --features="${ARCH_FEATURE}",shortint,internal-keycache,zk-pok,experimental \
        --test-threads "${n_threads_big}" \
        --no-tests=warn \
        -E "${filter_expression_big_params}"
@@ -119,7 +120,7 @@ and not test(~smart_add_and_mul)"""
            cargo "${RUST_TOOLCHAIN}" test \
                --profile "${cargo_profile}" \
                --package "${tfhe_package}" \
-                --features=shortint,internal-keycache,zk-pok,experimental \
+                --features="${ARCH_FEATURE}",shortint,internal-keycache,zk-pok,experimental \
                --doc \
                -- shortint::
        fi
@@ -133,7 +134,7 @@ else
        --cargo-profile "${cargo_profile}" \
        --package "${tfhe_package}" \
        --profile ci \
-        --features=shortint,internal-keycache,experimental \
+        --features="${ARCH_FEATURE}",shortint,internal-keycache,experimental \
        --test-threads "${n_threads_big}" \
        -E "${filter_expression}"

@@ -141,7 +142,7 @@ else
        cargo "${RUST_TOOLCHAIN}" test \
            --profile "${cargo_profile}" \
            --package "${tfhe_package}" \
-            --features=shortint,internal-keycache,experimental \
+            --features="${ARCH_FEATURE}",shortint,internal-keycache,experimental \
            --doc \
            -- --test-threads="${n_threads_big}" shortint::
    fi
--- a/scripts/test_filtering.py
+++ b/scripts/test_filtering.py
@@ -26,12 +26,6 @@ parser.add_argument(
    action="store_true",
    help="Run only a small subset of test suite",
 )
-parser.add_argument(
-    "--long-tests",
-    dest="long_tests",
-    action="store_true",
-    help="Run only the long tests suite",
-)
 parser.add_argument(
    "--nightly-tests",
    dest="nightly_tests",
@@ -86,7 +80,6 @@ EXCLUDED_INTEGER_TESTS = [
    "/.*test_wopbs_bivariate_crt_wopbs_param_message_[34]_carry_[34]_ks_pbs_gaussian_2m64$/",
    "/.*test_integer_smart_mul_param_message_4_carry_4_ks_pbs_gaussian_2m64$/",
    "/.*test_integer_default_add_sequence_multi_thread_param_message_4_carry_4_ks_pbs_gaussian_2m64$/",
-    "/.*::tests_long_run::.*/",
 ]

 # skip default_div, default_rem which are covered by default_div_rem
@@ -101,61 +94,55 @@ EXCLUDED_BIG_PARAMETERS = [
    "/.*_param_message_4_carry_4_ks_pbs_gaussian_2m64$/",
 ]

+
 def filter_integer_tests(input_args):
    (multi_bit_filter, group_filter) = (
        ("_multi_bit", "_group_[0-9]") if input_args.multi_bit else ("", "")
    )
    backend_filter = ""
-    if not input_args.long_tests:
-        if input_args.backend == "gpu":
-            backend_filter = "gpu::"
-            if multi_bit_filter:
-                # For now, GPU only has specific parameters set for multi-bit
-                multi_bit_filter = "_gpu_multi_bit"
+    if input_args.backend == "gpu":
+        backend_filter = "gpu::"
+        if multi_bit_filter:
+            # For now, GPU only has specific parameters set for multi-bit
+            multi_bit_filter = "_gpu_multi_bit"

-        filter_expression = [f"test(/^integer::{backend_filter}.*/)"]
+    filter_expression = [f"test(/^integer::{backend_filter}.*/)"]

-        if input_args.multi_bit:
-            filter_expression.append("test(~_multi_bit)")
-        else:
-            filter_expression.append("not test(~_multi_bit)")
+    if input_args.multi_bit:
+        filter_expression.append("test(~_multi_bit)")
+    else:
+        filter_expression.append("not test(~_multi_bit)")

-        if input_args.signed_only:
-            filter_expression.append("test(~_signed)")
-        if input_args.unsigned_only:
-            filter_expression.append("not test(~_signed)")
+    if input_args.signed_only:
+        filter_expression.append("test(~_signed)")
+    if input_args.unsigned_only:
+        filter_expression.append("not test(~_signed)")

-        if input_args.no_big_params:
-            for pattern in EXCLUDED_BIG_PARAMETERS:
-                filter_expression.append(f"not test({pattern})")
-
-        if input_args.fast_tests and input_args.nightly_tests:
-            filter_expression.append(
-                f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_[2-3]_carry_[2-3]_.*/)"
-            )
-        elif input_args.fast_tests:
-            # Test only fast default operations with only one set of parameters
-            filter_expression.append(
-                f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_2_carry_2_.*/)"
-            )
-        elif input_args.nightly_tests:
-            # Test only fast default operations with only one set of parameters
-            # This subset would run slower than fast_tests hence the use of nightly_tests
-            filter_expression.append(
-                f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_3_carry_3_.*/)"
-            )
-        excluded_tests = (
-            EXCLUDED_INTEGER_FAST_TESTS if input_args.fast_tests else EXCLUDED_INTEGER_TESTS
-        )
-        for pattern in excluded_tests:
+    if input_args.no_big_params:
+        for pattern in EXCLUDED_BIG_PARAMETERS:
            filter_expression.append(f"not test({pattern})")

-    else:
-        if input_args.backend == "gpu":
-            filter_expression = [f"test(/^integer::gpu::server_key::radix::tests_long_run.*/)"]
-        elif input_args.backend == "cpu":
-            filter_expression = [f"test(/^integer::server_key::radix_parallel::tests_long_run.*/)"]
+    if input_args.fast_tests and input_args.nightly_tests:
+        filter_expression.append(
+            f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_[2-3]_carry_[2-3]_.*/)"
+        )
+    elif input_args.fast_tests:
+        # Test only fast default operations with only one set of parameters
+        filter_expression.append(
+            f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_2_carry_2_.*/)"
+        )
+    elif input_args.nightly_tests:
+        # Test only fast default operations with only one set of parameters
+        # This subset would run slower than fast_tests hence the use of nightly_tests
+        filter_expression.append(
+            f"test(/.*_default_.*?_param{multi_bit_filter}{group_filter}_message_3_carry_3_.*/)"
+        )

+    excluded_tests = (
+        EXCLUDED_INTEGER_FAST_TESTS if input_args.fast_tests else EXCLUDED_INTEGER_TESTS
+    )
+    for pattern in excluded_tests:
+        filter_expression.append(f"not test({pattern})")

    return " and ".join(filter_expression)

--- a/tasks/Cargo.toml
+++ b/tasks/Cargo.toml
@@ -7,6 +7,7 @@ edition = "2021"

 [dependencies]
 clap = "=4.4.4"
+lazy_static = "1.4"
 log = "0.4"
 simplelog = "0.12"
 walkdir = "2.5.0"
--- a/tasks/src/check_tfhe_docs_are_tested.rs
+++ b/tasks/src/check_tfhe_docs_are_tested.rs
@@ -101,7 +101,7 @@ pub fn check_tfhe_docs_are_tested() -> Result<(), Error> {
        .into_iter()
        .filter_map(|entry| {
            let path = entry.path().canonicalize().ok()?;
-            if path.is_file() && path.extension().is_some_and(|e| e == "md") {
+            if path.is_file() && path.extension().map_or(false, |e| e == "md") {
                let file_content = std::fs::read_to_string(&path).ok()?;
                if file_content.contains("```rust") {
                    Some(path.to_path_buf())
--- a/tasks/src/main.rs
+++ b/tasks/src/main.rs
@@ -1,4 +1,5 @@
 use clap::{Arg, Command};
+use lazy_static::lazy_static;
 use log::LevelFilter;
 use simplelog::{ColorChoice, CombinedLogger, Config, TermLogger, TerminalMode};
 use std::sync::atomic::AtomicBool;
@@ -11,8 +12,9 @@ mod utils;
 // -------------------------------------------------------------------------------------------------
 // CONSTANTS
 // -------------------------------------------------------------------------------------------------
-
-static DRY_RUN: AtomicBool = AtomicBool::new(false);
+lazy_static! {
+    static ref DRY_RUN: AtomicBool = AtomicBool::new(false);
+}

 // -------------------------------------------------------------------------------------------------
 // MAIN
--- a/tfhe-csprng/Cargo.toml
+++ b/tfhe-csprng/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-csprng"
-version = "0.5.0"
+version = "0.4.1"
 edition = "2021"
 license = "BSD-3-Clause-Clear"
 description = "Cryptographically Secure PRNG used in the TFHE-rs library."
@@ -25,13 +25,29 @@ clap = "=4.4.4"

 [features]
 parallel = ["rayon"]
-software-prng = []
+seeder_x86_64_rdseed = []
+seeder_unix = []
+generator_x86_64_aesni = []
+generator_fallback = []
+generator_aarch64_aes = []
+
+x86_64 = [
+    "parallel",
+    "seeder_x86_64_rdseed",
+    "generator_x86_64_aesni",
+    "generator_fallback",
+]
+x86_64-unix = ["x86_64", "seeder_unix"]
+aarch64 = ["parallel", "generator_aarch64_aes", "generator_fallback"]
+aarch64-unix = ["aarch64", "seeder_unix"]

 [[bench]]
 name = "benchmark"
 path = "benches/benchmark.rs"
 harness = false
+required-features = ["seeder_x86_64_rdseed", "generator_x86_64_aesni"]

 [[example]]
 name = "generate"
 path = "examples/generate.rs"
+required-features = ["seeder_unix", "generator_fallback"]
--- a/tfhe-csprng/README.md
+++ b/tfhe-csprng/README.md
@@ -8,13 +8,13 @@ The implementation is based on the AES blockcipher used in CTR mode, as describe

 Two implementations are available, an accelerated one on x86_64 CPUs with the `aes` feature and the `sse2` feature, and a pure software one that can be used on other platforms.

-The crate also makes two seeders available, one needing the x86_64 instruction `rdseed` and another one based on the Unix random device `/dev/random` the latter requires the user to provide a secret.
+The crate also makes two seeders available: one needing the x86_64 feature `rdseed`, and another one based on the Unix random device `/dev/random`; the latter requires the user to provide a secret.

 ## Running the benchmarks

 To execute the benchmarks on an x86_64 platform:
 ```shell
-RUSTFLAGS="-Ctarget-cpu=native" cargo bench
+RUSTFLAGS="-Ctarget-cpu=native" cargo bench --features=seeder_x86_64_rdseed,generator_x86_64_aesni 
 ```

 ## License
--- a/tfhe-csprng/benches/benchmark.rs
+++ b/tfhe-csprng/benches/benchmark.rs
@@ -1,53 +1,15 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use tfhe_csprng::generators::{
-    BytesPerChild, ChildrenCount, DefaultRandomGenerator, RandomGenerator,
+    AesniRandomGenerator, BytesPerChild, ChildrenCount, RandomGenerator,
 };
-#[cfg(target_os = "macos")]
-use tfhe_csprng::seeders::AppleSecureEnclaveSeeder as ActivatedSeeder;
-#[cfg(all(
-    not(target_os = "macos"),
-    target_arch = "x86_64",
-    target_feature = "rdseed"
-))]
-use tfhe_csprng::seeders::RdseedSeeder as ActivatedSeeder;
-#[cfg(all(
-    not(target_os = "macos"),
-    not(all(target_arch = "x86_64", target_feature = "rdseed")),
-    target_family = "unix"
-))]
-use tfhe_csprng::seeders::UnixSeeder as ActivatedSeeder;
-
-use tfhe_csprng::seeders::Seeder;
+use tfhe_csprng::seeders::{RdseedSeeder, Seeder};

 // The number of bytes to generate during one benchmark iteration.
 const N_GEN: usize = 1_000_000;

-fn new_seeder() -> ActivatedSeeder {
-    #[cfg(target_os = "macos")]
-    {
-        ActivatedSeeder
-    }
-    #[cfg(all(
-        not(target_os = "macos"),
-        target_arch = "x86_64",
-        target_feature = "rdseed"
-    ))]
-    {
-        ActivatedSeeder::new()
-    }
-    #[cfg(all(
-        not(target_os = "macos"),
-        not(all(target_arch = "x86_64", target_feature = "rdseed")),
-        target_family = "unix"
-    ))]
-    {
-        ActivatedSeeder::new(0)
-    }
-}
-
 fn parent_generate(c: &mut Criterion) {
-    let mut seeder = new_seeder();
-    let mut generator = DefaultRandomGenerator::new(seeder.seed());
+    let mut seeder = RdseedSeeder;
+    let mut generator = AesniRandomGenerator::new(seeder.seed());
    c.bench_function("parent_generate", |b| {
        b.iter(|| {
            (0..N_GEN).for_each(|_| {
@@ -58,8 +20,8 @@ fn parent_generate(c: &mut Criterion) {
 }

 fn child_generate(c: &mut Criterion) {
-    let mut seeder = new_seeder();
-    let mut generator = DefaultRandomGenerator::new(seeder.seed());
+    let mut seeder = RdseedSeeder;
+    let mut generator = AesniRandomGenerator::new(seeder.seed());
    let mut generator = generator
        .try_fork(ChildrenCount(1), BytesPerChild(N_GEN * 10_000))
        .unwrap()
@@ -75,8 +37,8 @@ fn child_generate(c: &mut Criterion) {
 }

 fn fork(c: &mut Criterion) {
-    let mut seeder = new_seeder();
-    let mut generator = DefaultRandomGenerator::new(seeder.seed());
+    let mut seeder = RdseedSeeder;
+    let mut generator = AesniRandomGenerator::new(seeder.seed());
    c.bench_function("fork", |b| {
        b.iter(|| {
            black_box(
--- a/tfhe-csprng/build.rs
+++ b/tfhe-csprng/build.rs
@@ -0,0 +1,115 @@
+// To get clear error messages at compile time about why some piece of code may not be available,
+// this script checks that the enabled features are compatible with the target configuration.
+
+use std::collections::HashMap;
+use std::env;
+
+// See https://doc.rust-lang.org/reference/conditional-compilation.html#target_arch for various
+// compilation configuration
+
+// Describes what a cargo feature requires from the compilation target in order to be usable.
+// Can be easily extended if needed
+pub struct FeatureRequirement {
+    // Name of the cargo feature the requirements apply to
+    pub feature_name: &'static str,
+    // target_arch requirement, None means the architecture is not checked
+    pub feature_req_target_arch: Option<&'static str>,
+    // target_family requirement, None means the family is not checked
+    pub feature_req_target_family: Option<&'static str>,
+}
+
+// Helpers used to validate a feature against the configuration of the compilation target.
+impl FeatureRequirement {
+    // As we cannot use cfg!(feature = "feature_name") with something else than a literal, we need
+    // a reference to the HashMap we populate with the enabled features.
+    //
+    // Panics with an explicit message if `feature_name` has no entry in the map, which means the
+    // map was not updated together with the list of requirements.
+    fn is_activated(&self, build_activated_features: &HashMap<&'static str, bool>) -> bool {
+        match build_activated_features.get(self.feature_name) {
+            Some(&is_enabled) => is_enabled,
+            None => panic!(
+                "Feature `{}` has no entry in the enabled features map",
+                self.feature_name
+            ),
+        }
+    }
+
+    // Panics if the requirements are not met.
+    //
+    // The target configuration is only queried for the requirements that are actually set, so a
+    // feature without a target_family requirement does not need the family to be known.
+    fn check_requirements(&self) {
+        if let Some(feature_req_target_arch) = self.feature_req_target_arch {
+            let target_arch = get_target_arch_cfg();
+            if feature_req_target_arch != target_arch {
+                panic!(
+                    "Feature `{}` requires target_arch `{}`, current cfg: `{}`",
+                    self.feature_name, feature_req_target_arch, target_arch
+                )
+            }
+        }
+
+        if let Some(feature_req_target_family) = self.feature_req_target_family {
+            // The families are received as a single comma separated string, the requirement is
+            // met as soon as one of them matches.
+            let target_families = get_target_family_cfgs();
+            if target_families
+                .split(',')
+                .all(|family| family != feature_req_target_family)
+            {
+                panic!(
+                    "Feature `{}` requires target_family `{}`, current cfgs: `{}`",
+                    self.feature_name, feature_req_target_family, target_families
+                )
+            }
+        }
+    }
+}
+
+// Target requirements of the features checked by this script. A feature that is not listed here
+// is not checked.
+// const vecs are not yet a thing so use a fixed size array (update the array size when adding
+// requirements)
+static FEATURE_REQUIREMENTS: [FeatureRequirement; 4] = [
+    FeatureRequirement {
+        feature_name: "seeder_x86_64_rdseed",
+        feature_req_target_arch: Some("x86_64"),
+        feature_req_target_family: None,
+    },
+    FeatureRequirement {
+        feature_name: "generator_x86_64_aesni",
+        feature_req_target_arch: Some("x86_64"),
+        feature_req_target_family: None,
+    },
+    FeatureRequirement {
+        feature_name: "seeder_unix",
+        feature_req_target_arch: None,
+        feature_req_target_family: Some("unix"),
+    },
+    FeatureRequirement {
+        feature_name: "generator_aarch64_aes",
+        feature_req_target_arch: Some("aarch64"),
+        feature_req_target_family: None,
+    },
+];
+
+// For a "feature_name" feature_cfg!("feature_name") expands to
+// ("feature_name", cfg!(feature = "feature_name"))
+// i.e. a (name, enabled) pair. Only literals are accepted as the name is forwarded to cfg!.
+macro_rules! feature_cfg {
+    ($feat_name:literal) => {
+        ($feat_name, cfg!(feature = $feat_name))
+    };
+}
+
+// A static HashMap would need an extra crate (phf or lazy static e.g.), so the map is rebuilt by
+// this function each time it is called.
+// Returns, for each feature checked by this script, whether it is enabled for the current build.
+fn get_feature_enabled_status() -> HashMap<&'static str, bool> {
+    let feature_statuses = [
+        feature_cfg!("seeder_x86_64_rdseed"),
+        feature_cfg!("generator_x86_64_aesni"),
+        feature_cfg!("seeder_unix"),
+        feature_cfg!("generator_aarch64_aes"),
+    ];
+
+    feature_statuses.into_iter().collect()
+}
+
+// See https://stackoverflow.com/a/43435335/18088947 for the inspiration of this code
+// Returns the target_arch of the compilation target, panics if the variable is not available.
+fn get_target_arch_cfg() -> String {
+    let target_arch = env::var("CARGO_CFG_TARGET_ARCH");
+    target_arch.expect("CARGO_CFG_TARGET_ARCH is not set")
+}
+
+// Returns the comma separated list of target families of the compilation target.
+//
+// The variable may be absent when the target belongs to no family (cargo only exports the
+// CARGO_CFG_* variables of the cfgs that are set). An empty string is returned in that case
+// instead of panicking, so that no family matches and the caller can report a precise error.
+fn get_target_family_cfgs() -> String {
+    env::var("CARGO_CFG_TARGET_FAMILY").unwrap_or_default()
+}
+
+// Build script entry point: checks every enabled feature against the target configuration.
+fn main() {
+    let enabled_features = get_feature_enabled_status();
+
+    for requirement in FEATURE_REQUIREMENTS.iter() {
+        // Features that are not enabled have nothing to check
+        if !requirement.is_activated(&enabled_features) {
+            continue;
+        }
+        // This will panic if some requirements for the feature are not met
+        requirement.check_requirements();
+    }
+}
--- a/tfhe-csprng/examples/generate.rs
+++ b/tfhe-csprng/examples/generate.rs
@@ -2,29 +2,35 @@
 //! the program stdout. It can also generate a fixed number of bytes by passing a value along the
 //! optional argument `--bytes_total`. For testing purpose.
 use clap::{value_parser, Arg, Command};
-use tfhe_csprng::generators::{DefaultRandomGenerator, RandomGenerator};
+#[cfg(feature = "generator_x86_64_aesni")]
+use tfhe_csprng::generators::AesniRandomGenerator as ActivatedRandomGenerator;
+#[cfg(feature = "generator_aarch64_aes")]
+use tfhe_csprng::generators::NeonAesRandomGenerator as ActivatedRandomGenerator;
+use tfhe_csprng::generators::RandomGenerator;
+#[cfg(all(
+    not(feature = "generator_x86_64_aesni"),
+    not(feature = "generator_aarch64_aes"),
+    feature = "generator_fallback"
+))]
+use tfhe_csprng::generators::SoftwareRandomGenerator as ActivatedRandomGenerator;

 use std::io::prelude::*;
 use std::io::{stdout, StdoutLock};
 #[cfg(target_os = "macos")]
 use tfhe_csprng::seeders::AppleSecureEnclaveSeeder as ActivatedSeeder;
-#[cfg(all(
-    not(target_os = "macos"),
-    target_arch = "x86_64",
-    target_feature = "rdseed"
-))]
+#[cfg(all(not(target_os = "macos"), feature = "seeder_x86_64_rdseed"))]
 use tfhe_csprng::seeders::RdseedSeeder as ActivatedSeeder;
 use tfhe_csprng::seeders::Seeder;
 #[cfg(all(
    not(target_os = "macos"),
-    not(all(target_arch = "x86_64", target_feature = "rdseed")),
-    target_family = "unix"
+    not(feature = "seeder_x86_64_rdseed"),
+    feature = "seeder_unix"
 ))]
 use tfhe_csprng::seeders::UnixSeeder as ActivatedSeeder;

 fn write_bytes(
    buffer: &mut [u8],
-    generator: &mut DefaultRandomGenerator,
+    generator: &mut ActivatedRandomGenerator,
    stdout: &mut StdoutLock<'_>,
 ) -> std::io::Result<()> {
    buffer.iter_mut().zip(generator).for_each(|(b, g)| *b = g);
@@ -33,7 +39,7 @@ fn write_bytes(

 fn infinite_bytes_generation(
    buffer: &mut [u8],
-    generator: &mut DefaultRandomGenerator,
+    generator: &mut ActivatedRandomGenerator,
    stdout: &mut StdoutLock<'_>,
 ) {
    while write_bytes(buffer, generator, stdout).is_ok() {}
@@ -42,7 +48,7 @@ fn infinite_bytes_generation(
 fn bytes_generation(
    bytes_total: usize,
    buffer: &mut [u8],
-    generator: &mut DefaultRandomGenerator,
+    generator: &mut ActivatedRandomGenerator,
    stdout: &mut StdoutLock<'_>,
 ) {
    let quotient = bytes_total / buffer.len();
@@ -55,29 +61,6 @@ fn bytes_generation(
    write_bytes(&mut buffer[0..remaining], generator, stdout).unwrap()
 }

-fn new_seeder() -> ActivatedSeeder {
-    #[cfg(target_os = "macos")]
-    {
-        ActivatedSeeder
-    }
-    #[cfg(all(
-        not(target_os = "macos"),
-        target_arch = "x86_64",
-        target_feature = "rdseed"
-    ))]
-    {
-        ActivatedSeeder::new()
-    }
-    #[cfg(all(
-        not(target_os = "macos"),
-        not(all(target_arch = "x86_64", target_feature = "rdseed")),
-        target_family = "unix"
-    ))]
-    {
-        ActivatedSeeder::new(0)
-    }
-}
-
 pub fn main() {
    let matches = Command::new(
        "Generate a stream of random numbers, specify no flags for infinite generation",
@@ -91,11 +74,25 @@ pub fn main() {
    )
    .get_matches();

+    // Ugly hack to be able to use UnixSeeder
+    #[cfg(all(
+        not(target_os = "macos"),
+        not(feature = "seeder_x86_64_rdseed"),
+        feature = "seeder_unix"
+    ))]
+    let new_seeder = || ActivatedSeeder::new(0);
+    #[cfg(not(all(
+        not(target_os = "macos"),
+        not(feature = "seeder_x86_64_rdseed"),
+        feature = "seeder_unix"
+    )))]
+    let new_seeder = || ActivatedSeeder;
+
    let mut seeder = new_seeder();
    let seed = seeder.seed();
    // Don't print on std out
    eprintln!("seed={seed:?}");
-    let mut generator = DefaultRandomGenerator::new(seed);
+    let mut generator = ActivatedRandomGenerator::new(seed);
    let stdout = stdout();
    let mut buffer = [0u8; 16];

--- a/tfhe-csprng/src/generators/aes_ctr/mod.rs
+++ b/tfhe-csprng/src/generators/aes_ctr/mod.rs
@@ -206,6 +206,7 @@ pub use index::*;

 /// A module containing structures to manage table indices and buffer pointers together properly.
 mod states;
+pub use states::*;

 /// A module containing an abstraction for aes block ciphers.
 mod block_cipher;
--- a/tfhe-csprng/src/generators/aes_ctr/parallel.rs
+++ b/tfhe-csprng/src/generators/aes_ctr/parallel.rs
@@ -1,5 +1,6 @@
-use crate::generators::aes_ctr::states::State;
-use crate::generators::aes_ctr::{AesBlockCipher, AesCtrGenerator, ChildrenClosure, TableIndex};
+use crate::generators::aes_ctr::{
+    AesBlockCipher, AesCtrGenerator, ChildrenClosure, State, TableIndex,
+};
 use crate::generators::{BytesPerChild, ChildrenCount, ForkError};

 /// A type alias for the parallel children iterator type.
--- a/tfhe-csprng/src/generators/default.rs
+++ b/tfhe-csprng/src/generators/default.rs
@@ -1,9 +0,0 @@
-#[cfg(all(target_arch = "x86_64", not(feature = "software-prng")))]
-pub type DefaultRandomGenerator = super::AesniRandomGenerator;
-#[cfg(all(target_arch = "aarch64", not(feature = "software-prng")))]
-pub type DefaultRandomGenerator = super::NeonAesRandomGenerator;
-#[cfg(any(
-    feature = "software-prng",
-    not(any(target_arch = "x86_64", target_arch = "aarch64"))
-))]
-pub type DefaultRandomGenerator = super::SoftwareRandomGenerator;
--- a/tfhe-csprng/src/generators/implem/aarch64/block_cipher.rs
+++ b/tfhe-csprng/src/generators/implem/aarch64/block_cipher.rs
@@ -25,8 +25,7 @@ impl AesBlockCipher for ArmAesBlockCipher {
        if !(aes_detected && neon_detected) {
            panic!(
                "The ArmAesBlockCipher requires both aes and neon aarch64 CPU features.\n\
-                aes feature available: {}\nneon feature available: {}\n\
-                Please consider enabling the SoftwareRandomGenerator with the `software-prng` feature",
+                aes feature available: {}\nneon feature available: {}\n.",
                aes_detected, neon_detected
            )
        }
--- a/tfhe-csprng/src/generators/implem/aesni/block_cipher.rs
+++ b/tfhe-csprng/src/generators/implem/aesni/block_cipher.rs
@@ -20,8 +20,7 @@ impl AesBlockCipher for AesniBlockCipher {
        if !(aes_detected && sse2_detected) {
            panic!(
                "The AesniBlockCipher requires both aes and sse2 x86 CPU features.\n\
-                aes feature available: {}\nsse2 feature available: {}\n\
-                Please consider enabling the SoftwareRandomGenerator with the `software-prng` feature",
+                aes feature available: {}\nsse2 feature available: {}\n.",
                aes_detected, sse2_detected
            )
        }
--- a/tfhe-csprng/src/generators/implem/mod.rs
+++ b/tfhe-csprng/src/generators/implem/mod.rs
@@ -1,12 +1,14 @@
-#[cfg(target_arch = "x86_64")]
+#[cfg(feature = "generator_x86_64_aesni")]
 mod aesni;
-#[cfg(target_arch = "x86_64")]
+#[cfg(feature = "generator_x86_64_aesni")]
 pub use aesni::*;

-#[cfg(target_arch = "aarch64")]
+#[cfg(feature = "generator_aarch64_aes")]
 mod aarch64;
-#[cfg(target_arch = "aarch64")]
+#[cfg(feature = "generator_aarch64_aes")]
 pub use aarch64::*;

+#[cfg(feature = "generator_fallback")]
 mod soft;
+#[cfg(feature = "generator_fallback")]
 pub use soft::*;
--- a/tfhe-csprng/src/generators/mod.rs
+++ b/tfhe-csprng/src/generators/mod.rs
@@ -123,10 +123,6 @@ mod aes_ctr;
 mod implem;
 pub use implem::*;

-pub mod default;
-/// Convenience alias for the most efficient CSPRNG implementation available.
-pub use default::DefaultRandomGenerator;
-
 #[cfg(test)]
 #[allow(unused)] // to please clippy when tests are not activated
 pub mod generator_generic_test {
--- a/tfhe-csprng/src/seeders/implem/mod.rs
+++ b/tfhe-csprng/src/seeders/implem/mod.rs
@@ -3,12 +3,12 @@ mod apple_secure_enclave_seeder;
 #[cfg(target_os = "macos")]
 pub use apple_secure_enclave_seeder::AppleSecureEnclaveSeeder;

-#[cfg(target_arch = "x86_64")]
+#[cfg(feature = "seeder_x86_64_rdseed")]
 mod rdseed;
-#[cfg(target_arch = "x86_64")]
+#[cfg(feature = "seeder_x86_64_rdseed")]
 pub use rdseed::RdseedSeeder;

-#[cfg(target_family = "unix")]
+#[cfg(feature = "seeder_unix")]
 mod unix;
-#[cfg(target_family = "unix")]
+#[cfg(feature = "seeder_unix")]
 pub use unix::UnixSeeder;
--- a/tfhe-csprng/src/seeders/implem/rdseed.rs
+++ b/tfhe-csprng/src/seeders/implem/rdseed.rs
@@ -4,23 +4,7 @@ use crate::seeders::{Seed, Seeder};
 ///
 /// The `rdseed` instruction allows to deliver seeds from a hardware source of entropy see
 /// <https://www.felixcloutier.com/x86/rdseed> .
-pub struct RdseedSeeder(());
-
-impl RdseedSeeder {
-    pub fn new() -> Self {
-        if Self::is_available() {
-            Self(())
-        } else {
-            panic!("Tried to use RdSeedSeeder but rdseed instruction is not enabled on the current machine");
-        }
-    }
-}
-
-impl Default for RdseedSeeder {
-    fn default() -> Self {
-        Self::new()
-    }
-}
+pub struct RdseedSeeder;

 impl Seeder for RdseedSeeder {
    fn seed(&mut self) -> Seed {
@@ -62,6 +46,6 @@ mod test {

    #[test]
    fn check_bounded_sequence_difference() {
-        check_seeder_fixed_sequences_different(|_| RdseedSeeder::new());
+        check_seeder_fixed_sequences_different(|_| RdseedSeeder);
    }
 }
--- a/tfhe-fft/Cargo.toml
+++ b/tfhe-fft/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-fft"
-version = "0.7.0"
+version = "0.6.0"
 edition = "2021"
 description = "tfhe-fft is a pure Rust high performance fast Fourier transform library."
 readme = "README.md"
--- a/tfhe-ntt/Cargo.toml
+++ b/tfhe-ntt/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-ntt"
-version = "0.4.0"
+version = "0.3.0"
 edition = "2021"
 description = "tfhe-ntt is a pure Rust high performance number theoretic transform library."
 readme = "README.md"
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Arthur Meyre	8378ce7d44	clippy bug no span for large array on stack	2024-12-03 11:00:31 +01:00
Arthur Meyre	e0111f6dd1	chore(ci): toolchain update	2024-12-02 16:34:19 +01:00
Arthur Meyre	51fd605a0c	chore: update dependencies	2024-12-02 11:12:46 +01:00