wip: measure latencies of a list of cts

WIP: fix gpu streams and use iter_batched
chore(bench): new heuristic to define elements for throughput
2026-04-28 03:01:21 -04:00 · 2025-01-28 11:19:27 +01:00 · 2025-01-22 10:56:08 +01:00 · 2025-01-20 15:21:05 +01:00
687 changed files with 13895 additions and 25543 deletions
--- a/.github/actions/gpu_setup/action.yml
+++ b/.github/actions/gpu_setup/action.yml
@@ -1,63 +0,0 @@
-name: Setup Cuda
-description: Setup Cuda on Hyperstack or GitHub instance
-
-inputs:
-  cuda-version:
-    description: Version of Cuda to use
-    required: true
-  gcc-version:
-    description: Version of GCC to use
-    required: true
-  cmake-version:
-    description: Version of cmake to use
-    default: 3.29.6
-  github-instance:
-    description: Instance is hosted on GitHub
-    default: 'false'
-
-runs:
-  using: "composite"
-  steps:
-    # Mandatory on hyperstack since a bootable volume is not re-usable yet.
-    - name: Install dependencies
-      shell: bash
-      run: |
-        sudo apt update
-        curl -fsSL https://apt.kitware.com/keys/kitware-archive-latest.asc | sudo gpg --dearmour -o /etc/apt/trusted.gpg.d/kitware.gpg
-        sudo chmod 644 /etc/apt/trusted.gpg.d/kitware.gpg
-        echo 'deb [signed-by=/etc/apt/trusted.gpg.d/kitware.gpg] https://apt.kitware.com/ubuntu/ jammy main' | sudo tee /etc/apt/sources.list.d/kitware.list >/dev/null
-        sudo apt update
-        sudo apt install -y cmake cmake-format libclang-dev
-
-    - name: Install CUDA
-      if: inputs.github-instance == 'true'
-      shell: bash
-      run: |
-        TOOLKIT_VERSION="$(echo ${{ inputs.cuda-version }} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
-        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-        sudo dpkg -i cuda-keyring_1.1-1_all.deb
-        sudo apt update
-        sudo apt -y install cuda-toolkit-${TOOLKIT_VERSION}
-
-    - name: Export CUDA variables
-      shell: bash
-      run: |
-        CUDA_PATH=/usr/local/cuda-${{ inputs.cuda-version }}
-        echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
-        echo "PATH=$PATH:$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-        echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
-        echo "CUDA_MODULE_LOADER=EAGER" >> "${GITHUB_ENV}"
-
-    # Specify the correct host compilers
-    - name: Export gcc and g++ variables
-      shell: bash
-      run: |
-        {
-          echo "CC=/usr/bin/gcc-${{ inputs.gcc-version }}";
-          echo "CXX=/usr/bin/g++-${{ inputs.gcc-version }}";
-          echo "CUDAHOSTCXX=/usr/bin/g++-${{ inputs.gcc-version }}";
-        } >> "${GITHUB_ENV}"
-
-    - name: Check device is detected
-      shell: bash
-      run: nvidia-smi
--- a/.github/actions/hyperstack_setup/action.yml
+++ b/.github/actions/hyperstack_setup/action.yml
@@ -0,0 +1,53 @@
+name: Setup Cuda
+description: Setup Cuda on Hyperstack instance
+
+inputs:
+  cuda-version:
+    description: Version of Cuda to use
+    required: true
+  gcc-version:
+    description: Version of GCC to use
+    required: true
+  cmake-version:
+    description: Version of cmake to use
+    default: 3.29.6
+
+runs:
+  using: "composite"
+  steps:
+    # Mandatory on hyperstack since a bootable volume is not re-usable yet.
+    - name: Install dependencies
+      shell: bash
+      run: |
+        sudo apt update
+        sudo apt install -y checkinstall zlib1g-dev libssl-dev libclang-dev
+        wget https://github.com/Kitware/CMake/releases/download/v${{ inputs.cmake-version }}/cmake-${{ inputs.cmake-version }}.tar.gz
+        tar -zxvf cmake-${{ inputs.cmake-version }}.tar.gz
+        cd cmake-${{ inputs.cmake-version }}
+        ./bootstrap
+        make -j"$(nproc)"
+        sudo make install
+
+    - name: Export CUDA variables
+      shell: bash
+      run: |
+        CUDA_PATH=/usr/local/cuda-${{ inputs.cuda-version }}
+        echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
+        echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
+        echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"  # 64-bit CUDA libraries are installed under lib64
+        echo "CUDACXX=/usr/local/cuda-${{ inputs.cuda-version }}/bin/nvcc" >> "${GITHUB_ENV}"
+
+    # Specify the correct host compilers
+    - name: Export gcc and g++ variables
+      shell: bash
+      run: |
+        {
+          echo "CC=/usr/bin/gcc-${{ inputs.gcc-version }}";
+          echo "CXX=/usr/bin/g++-${{ inputs.gcc-version }}";
+          echo "CUDAHOSTCXX=/usr/bin/g++-${{ inputs.gcc-version }}";
+          echo "HOME=/home/ubuntu";
+        } >> "${GITHUB_ENV}"
+
+    - name: Check device is detected
+      shell: bash
+      run: nvidia-smi
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -11,10 +11,6 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -26,11 +22,10 @@ jobs:
    name: Setup instance (backward-compat-tests)
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -40,18 +35,11 @@ jobs:
          backend: aws
          profile: cpu-small

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  backward-compat-tests:
    name: Backward compatibility tests
    needs: [ setup-instance ]
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -59,7 +47,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -102,7 +90,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (backward-compat-tests)
@@ -110,9 +98,8 @@ jobs:
    needs: [ setup-instance, backward-compat-tests ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -127,4 +114,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -11,22 +11,32 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"
+  MSG_MINIMAL: event,action url,commit
+  BRANCH: ${{ github.head_ref || github.ref }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
+  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
+  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
+  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
+    paths:
+      - '.github/**'
+      - 'ci/**'
+  # General entry point for Zama's pull requests as well as contributions from forks.
+  pull_request_target:
+    paths:
+      - '**'
+      - '!.github/**'
+      - '!ci/**'

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
@@ -52,19 +62,22 @@ jobs:
      user_docs_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.user_docs_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
+      ci_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ci_any_changed }}
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          ref: ${{ env.REF }}  # PR head SHA on pull_request_target: untrusted code, so credentials must not be persisted

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            dependencies:
              - tfhe/Cargo.toml
@@ -109,9 +121,13 @@ jobs:
              - '!tfhe/src/c_api/**'
              - 'tfhe/docs/**/**.md'
              - README.md
+            ci:
+              - .github/**
+              - ci/**

      - name: Aggregate file changes
        id: aggregated-changes
+        # CI files are not included in this aggregator.
        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
@@ -126,18 +142,27 @@ jobs:
        run: |
          echo "any_changed=true" >> "$GITHUB_OUTPUT"

+  # Fail if the triggering actor is not part of Zama organization.
+  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
+  check-user-permission:
+    needs: should-run
+    if: github.event_name != 'pull_request_target' ||
+      (github.event_name == 'pull_request_target' && needs.should-run.outputs.ci_file_changed == 'false')
+    uses: ./.github/workflows/check_triggering_actor.yml
+    secrets:
+      TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
  setup-instance:
    name: Setup instance (fast-tests)
    if: github.event_name == 'workflow_dispatch' ||
      (github.event_name != 'workflow_dispatch' && needs.should-run.outputs.any_file_changed == 'true')
-    needs: should-run
+    needs: [ should-run, check-user-permission ]
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -147,13 +172,6 @@ jobs:
          backend: aws
          profile: cpu-big

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  fast-tests:
    name: Fast CPU tests
    needs: [ should-run, setup-instance ]
@@ -166,7 +184,8 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          ref: ${{ env.REF }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -265,7 +284,7 @@ jobs:
          make test_zk

      - name: Slack Notification
-        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
@@ -278,9 +297,8 @@ jobs:
    needs: [ setup-instance, fast-tests ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -14,16 +14,12 @@ env:
  # nextest
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
  NO_BIG_PARAMS: FALSE
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]
  push:
    branches:
      - main
@@ -32,11 +28,12 @@ jobs:
  should-run:
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
        steps.changed-files.outputs.integer_any_changed }}
@@ -45,13 +42,14 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          persist-credentials: "false"

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            integer:
              - tfhe/Cargo.toml
@@ -69,15 +67,14 @@ jobs:
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -87,18 +84,11 @@ jobs:
          backend: aws
          profile: cpu-big

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  unsigned-integer-tests:
    name: Unsigned integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -106,7 +96,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: "false"
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -140,7 +130,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (unsigned-integer-tests)
@@ -148,9 +138,8 @@ jobs:
    needs: [setup-instance, unsigned-integer-tests]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -165,4 +154,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -14,16 +14,12 @@ env:
  # nextest
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
  NO_BIG_PARAMS: FALSE
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]
  push:
    branches:
      - main
@@ -37,7 +33,7 @@ jobs:
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
        steps.changed-files.outputs.integer_any_changed }}
@@ -46,13 +42,14 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          persist-credentials: "false"

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            integer:
              - tfhe/Cargo.toml
@@ -70,15 +67,14 @@ jobs:
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -88,18 +84,11 @@ jobs:
          backend: aws
          profile: cpu-big

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  signed-integer-tests:
    name: Signed integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -107,7 +96,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: "false"
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -145,7 +134,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (signed-integer-tests)
@@ -153,9 +142,8 @@ jobs:
    needs: [setup-instance, signed-integer-tests]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -170,4 +158,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -11,10 +11,6 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_64-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -31,7 +27,7 @@ jobs:
    if: github.event_name != 'schedule' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
@@ -67,13 +63,14 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            dependencies:
              - tfhe/Cargo.toml
@@ -142,11 +138,10 @@ jobs:
    needs: should-run
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -156,20 +151,13 @@ jobs:
          backend: aws
          profile: cpu-big

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cpu-tests:
    name: CPU tests
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    needs: [ should-run, setup-instance ]
    concurrency:
-      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -177,7 +165,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -252,7 +240,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cpu-tests)
@@ -260,9 +248,8 @@ jobs:
    needs: [ setup-instance, cpu-tests ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -277,4 +264,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -10,10 +10,6 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -27,11 +23,10 @@ jobs:
    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -41,18 +36,11 @@ jobs:
          backend: aws
          profile: cpu-small

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  wasm-tests:
    name: WASM tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -60,7 +48,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -121,7 +109,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (wasm-tests)
@@ -129,9 +117,8 @@ jobs:
    needs: [ setup-instance, wasm-tests ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -146,4 +133,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -51,8 +51,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -104,8 +103,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -50,8 +50,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -95,8 +94,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -52,8 +52,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -73,8 +72,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks
        run: |
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -17,7 +17,7 @@ on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]
  schedule:
    # Weekly benchmarks will be triggered each Friday at 9p.m.
    - cron: "0 21 * * 5"
@@ -33,13 +33,16 @@ jobs:
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ["self-hosted", "4090-desktop"]
    timeout-minutes: 1440 # 24 hours
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -60,8 +63,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run integer benchmarks
        run: |
@@ -97,7 +99,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  cuda-core-crypto-benchmarks:
    name: Cuda core crypto benchmarks  (RTX 4090)
@@ -114,8 +116,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -135,8 +136,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run core crypto benchmarks
        run: |
@@ -182,7 +182,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  remove_github_label:
    name: Remove 4090 bench label
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -53,11 +53,10 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
@@ -70,6 +69,11 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
      - name: Install rust
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
@@ -104,8 +108,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_gpu_erc20_common.yml
+++ b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -14,7 +14,7 @@ on:
        type: string
        required: true
    secrets:
-      REPO_CHECKOUT_TOKEN:
+      FHE_ACTIONS_TOKEN:
        required: true
      SLAB_ACTION_TOKEN:
        required: true
@@ -80,11 +80,10 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
@@ -97,6 +96,11 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
      - name: Install rust
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
@@ -130,8 +134,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_gpu_integer_common.yml
+++ b/.github/workflows/benchmark_gpu_integer_common.yml
@@ -26,7 +26,7 @@ on:
        type: boolean
        default: false
    secrets:
-      REPO_CHECKOUT_TOKEN:
+      FHE_ACTIONS_TOKEN:
        required: true
      SLAB_ACTION_TOKEN:
        required: true
@@ -150,11 +150,10 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
@@ -167,6 +166,11 @@ jobs:
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

+      - name: Set up home
+        # The "Install rust" step requires the root user to have a HOME directory, which is not set.
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
      - name: Install rust
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
@@ -206,8 +210,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_integer.yml
+++ b/.github/workflows/benchmark_integer.yml
@@ -119,8 +119,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -140,8 +139,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Should run benchmarks with all precisions
        if: inputs.all_precisions
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -82,8 +82,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -103,8 +102,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
--- a/.github/workflows/benchmark_signed_integer.yml
+++ b/.github/workflows/benchmark_signed_integer.yml
@@ -119,8 +119,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -140,8 +139,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Should run benchmarks with all precisions
        if: inputs.all_precisions
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -94,8 +94,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -94,8 +94,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_tfhe_zk_pok.yml
+++ b/.github/workflows/benchmark_tfhe_zk_pok.yml
@@ -3,14 +3,6 @@ name: tfhe-zk-pok benchmarks

 on:
  workflow_dispatch:
-    inputs:
-      bench_type:
-        description: "Benchmarks type"
-        type: choice
-        default: latency
-        options:
-          - latency
-          - throughput
  push:
    branches:
      - main
@@ -28,7 +20,6 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  BENCH_TYPE: ${{ inputs.bench_type || 'latency' }}

 jobs:
  should-run:
@@ -45,8 +36,9 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            zk_pok:
              - tfhe-zk-pok/**
@@ -88,8 +80,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -109,12 +100,11 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks
        run: |
-          make BENCH_TYPE=${{ env.BENCH_TYPE }} bench_tfhe_zk_pok
+          make bench_tfhe_zk_pok

      - name: Parse results
        run: |
@@ -128,8 +118,7 @@ jobs:
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
-          --name-suffix avx512 \
-          --bench-type ${{ env.BENCH_TYPE }}
+          --name-suffix avx512

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
@@ -142,8 +131,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -28,7 +28,7 @@ jobs:
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
@@ -36,13 +36,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            wasm_bench:
              - tfhe/Cargo.toml
@@ -88,8 +88,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -177,8 +176,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -43,13 +43,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            zk_pok:
              - tfhe/Cargo.toml
@@ -130,8 +130,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -151,8 +150,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
@@ -189,8 +187,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -2,7 +2,6 @@
 name: Check commit and PR compliance
 on:
  pull_request:
-
 jobs:
  check-commit-pr:
    name: Check commit and PR
--- a/.github/workflows/check_triggering_actor.yml
+++ b/.github/workflows/check_triggering_actor.yml
@@ -0,0 +1,29 @@
+# Check if triggering actor is a collaborator and has write access
+name: Check Triggering Actor
+
+on:
+  workflow_call:
+    secrets:
+      TOKEN:
+        required: true
+
+jobs:
+  check-actor-permission:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Get User Permission
+        id: check-access
+        uses: actions-cool/check-user-permission@7b90a27f92f3961b368376107661682c441f6103 # v2.3.0
+        with:
+          require: write
+          username: ${{ github.triggering_actor }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.TOKEN }}
+
+      - name: Check User Permission
+        if: steps.check-access.outputs.require-result == 'false'
+        run: |
+          echo "${{ github.triggering_actor }} does not have permissions on this repo."
+          echo "Current permission level is ${{ steps.check-access.outputs.user-permission }}"
+          echo "Job originally triggered by ${{ github.actor }}"
+          exit 1
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -6,7 +6,6 @@ on:

 env:
  ACTIONLINT_VERSION: 1.6.27
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 jobs:
  lint-check:
@@ -15,9 +14,6 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Get actionlint
        run: |
@@ -31,8 +27,7 @@ jobs:
          make lint_workflow

      - name: Ensure SHA pinned actions
-        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@25ed13d0628a1601b4b44048e63cc4328ed03633 # v3.0.22
+        uses: zgosalvez/github-actions-ensure-sha-pinned-actions@c3a2b64f69b7a1542a68f44d9edbd9ec3fc1455e # v3.0.20
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
-            ./
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -53,7 +53,7 @@ jobs:

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
          files_yaml: |
            tfhe:
@@ -83,7 +83,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@13ce06bfc6bbe3ecf90edbbf1bc32fe5978ca1d3
+        uses: codecov/codecov-action@1e68e06f1dbfde0e4cefc87efeba9e4643565303
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@13ce06bfc6bbe3ecf90edbbf1bc32fe5978ca1d3
+        uses: codecov/codecov-action@1e68e06f1dbfde0e4cefc87efeba9e4643565303
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -10,10 +10,6 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -27,11 +23,10 @@ jobs:
    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -41,18 +36,11 @@ jobs:
          backend: aws
          profile: cpu-small

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  csprng-randomness-tests:
    name: CSPRNG randomness tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -60,7 +48,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -77,7 +65,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (csprng-randomness-tests)
@@ -85,9 +73,8 @@ jobs:
    needs: [ setup-instance, csprng-randomness-tests ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -102,4 +89,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an RTX 4090 machine
-name: Cuda - 4090 full tests
+name: TFHE Cuda Backend - 4090 full tests

 env:
  CARGO_TERM_COLOR: always
@@ -11,7 +11,6 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -29,7 +28,7 @@ jobs:
      contains(github.event.label.name, '4090_test') ||
      (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ["self-hosted", "4090-desktop"]

@@ -38,7 +37,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -81,4 +80,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: Cuda - Fast tests on H100
+name: TFHE Cuda Backend - Fast tests on H100

 env:
  CARGO_TERM_COLOR: always
@@ -12,22 +12,18 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -35,13 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -68,11 +64,10 @@ jobs:
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -82,20 +77,13 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: CUDA H100 tests
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -111,14 +99,17 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -151,11 +142,10 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
@@ -163,9 +153,8 @@ jobs:
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -180,4 +169,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an AWS instance
-name: Cuda - Fast tests
+name: TFHE Cuda Backend - Fast tests

 env:
  CARGO_TERM_COLOR: always
@@ -12,10 +12,6 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -26,7 +22,7 @@ jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -34,13 +30,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -62,15 +58,14 @@ jobs:
  setup-instance:
    name: Setup instance (cuda-tests)
    needs: should-run
-    if: github.event_name == 'workflow_dispatch' ||
+    if: github.event_name != 'pull_request' ||
      needs.should-run.outputs.gpu_test == 'true'
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -80,20 +75,13 @@ jobs:
          backend: hyperstack
          profile: gpu-test

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: CUDA tests
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -109,14 +97,17 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -149,11 +140,10 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests)
@@ -161,9 +151,8 @@ jobs:
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -178,4 +167,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: Cuda - Full tests on H100
+name: TFHE Cuda Backend - Full tests on H100

 env:
  CARGO_TERM_COLOR: always
@@ -11,6 +11,7 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  workflow_dispatch:
@@ -65,14 +66,18 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}

+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
@@ -105,7 +110,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
@@ -128,4 +133,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an AWS instance
-name: Cuda - Full tests multi-GPU
+name: TFHE Cuda Backend - Full tests multi-GPU

 env:
  CARGO_TERM_COLOR: always
@@ -12,10 +12,6 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -27,7 +23,7 @@ jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -35,13 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -68,11 +64,10 @@ jobs:
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -82,20 +77,13 @@ jobs:
          backend: hyperstack
          profile: multi-gpu-test

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: CUDA multi-GPU tests
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -111,14 +99,17 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -154,11 +145,10 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests-multi-gpu)
@@ -166,9 +156,8 @@ jobs:
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -183,4 +172,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_integer_long_run_tests.yml
+++ b/.github/workflows/gpu_integer_long_run_tests.yml
@@ -59,11 +59,15 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}

+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
+
      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
        with:
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -1,5 +1,5 @@
 # Perfom tfhe-cuda-backend post-commit checks on an AWS instance
-name: Cuda - Post-commit Checks
+name: TFHE Cuda Backend - Post-commit Checks

 env:
  CARGO_TERM_COLOR: always
@@ -11,10 +11,6 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "large_ubuntu_16-22.04"

 on:
  pull_request:
@@ -24,11 +20,10 @@ jobs:
    name: Setup instance (cuda-pcc)
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -38,18 +33,11 @@ jobs:
          backend: aws
          profile: gpu-build

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-pcc:
    name: CUDA post-commit checks
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -68,17 +56,11 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

-      - name: Install CUDA
-        if: env.SECRETS_AVAILABLE == 'false'
-        shell: bash
+      - name: Set up home
        run: |
-          TOOLKIT_VERSION="$(echo ${{ matrix.cuda }} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
-          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-          sudo dpkg -i cuda-keyring_1.1-1_all.deb
-          sudo apt update
-          sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -101,6 +83,7 @@ jobs:
            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
+            echo "HOME=/home/ubuntu";
          } >> "${GITHUB_ENV}"

      - name: Run fmt checks
@@ -112,12 +95,12 @@ jobs:
          make pcc_gpu

      - name: Slack Notification
-        if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}
+        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-pcc)
@@ -125,9 +108,8 @@ jobs:
    needs: [ setup-instance, cuda-pcc ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -142,4 +124,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-pcc) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-pcc) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -1,5 +1,5 @@
 # Signed integer GPU tests on an RTXA6000 VM on hyperstack with classical PBS
-name: Cuda - Signed integer tests with classical PBS
+name: TFHE Cuda Backend - Signed integer tests with classical PBS

 env:
  CARGO_TERM_COLOR: always
@@ -12,22 +12,18 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -35,13 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -68,11 +64,10 @@ jobs:
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -82,20 +77,13 @@ jobs:
          backend: hyperstack
          profile: gpu-test

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: CUDA signed integer tests with classical PBS
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -109,16 +97,16 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -137,11 +125,10 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Integer GPU signed integer tests with classical PBS finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer GPU signed integer tests with classical PBS finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-signed-classic-tests)
@@ -149,9 +136,8 @@ jobs:
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -166,4 +152,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-signed-classic-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-signed-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -1,5 +1,5 @@
 # Signed integer GPU tests on an H100 VM on hyperstack
-name: Cuda - Signed integer tests on H100
+name: TFHE Cuda Backend - Signed integer tests on H100

 env:
  CARGO_TERM_COLOR: always
@@ -12,23 +12,18 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
-
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -36,13 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -69,11 +64,10 @@ jobs:
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -83,20 +77,13 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: CUDA H100 signed integer tests
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -110,16 +97,16 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -138,11 +125,10 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
@@ -150,9 +136,8 @@ jobs:
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -167,4 +152,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend signed integer on an AWS instance
-name: Cuda - Signed integer tests
+name: TFHE Cuda Backend - Signed integer tests

 env:
  CARGO_TERM_COLOR: always
@@ -14,15 +14,14 @@ env:
  FAST_TESTS: TRUE
  NIGHTLY_TESTS: FALSE
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
+    types:
+      - opened
+      - synchronize
  schedule:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"
@@ -31,7 +30,7 @@ jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -39,13 +38,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -63,6 +62,7 @@ jobs:
              - '.github/workflows/gpu_signed_integer_tests.yml'
              - scripts/integer-tests.sh
              - ci/slab.toml
+
  setup-instance:
    name: Setup instance (cuda-signed-integer-tests)
    runs-on: ubuntu-latest
@@ -71,11 +71,10 @@ jobs:
      github.event_name == 'workflow_dispatch' ||
      needs.should-run.outputs.gpu_test == 'true'
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -85,20 +84,13 @@ jobs:
          backend: hyperstack
          profile: gpu-test

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-signed-integer-tests:
    name: CUDA signed integer tests
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -114,14 +106,17 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -151,7 +146,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}
-          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests)
@@ -159,9 +154,8 @@ jobs:
    needs: [ setup-instance, cuda-signed-integer-tests ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -176,4 +170,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-signed-integer-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -1,5 +1,5 @@
 # Test unsigned integers on an RTXA6000 VM on hyperstack with the classical PBS
-name: Cuda - Unsigned integer tests with classical PBS
+name: TFHE Cuda Backend - Unsigned integer tests with classical PBS

 env:
  CARGO_TERM_COLOR: always
@@ -12,23 +12,18 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
-
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -36,13 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -64,16 +59,15 @@ jobs:
  setup-instance:
    name: Setup instance (cuda-unsigned-classic-tests)
    needs: should-run
-    if: github.event_name == 'workflow_dispatch' ||
+    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -83,20 +77,13 @@ jobs:
          backend: hyperstack
          profile: gpu-test

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: CUDA unsigned integer tests with classical PBS
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -110,16 +97,16 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -138,11 +125,10 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Unsigned integer GPU classic tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Unsigned integer GPU classic tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-unsigned-classic-tests)
@@ -150,9 +136,8 @@ jobs:
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -167,4 +152,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-classic-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -1,5 +1,5 @@
 # Test unsigned integers on an H100 VM on hyperstack
-name: Cuda - Unsigned integer tests on H100
+name: TFHE Cuda Backend - Unsigned integer tests on H100

 env:
  CARGO_TERM_COLOR: always
@@ -12,22 +12,18 @@ env:
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -35,13 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -63,16 +59,15 @@ jobs:
  setup-instance:
    name: Setup instance (cuda-h100-tests)
    needs: should-run
-    if: github.event_name == 'workflow_dispatch' ||
+    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -82,20 +77,13 @@ jobs:
          backend: hyperstack
          profile: single-h100

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-tests-linux:
    name: CUDA H100 unsigned integer tests
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -109,16 +97,16 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -137,11 +125,10 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
@@ -149,9 +136,8 @@ jobs:
    needs: [ setup-instance, cuda-tests-linux ]
    runs-on: ubuntu-latest
    steps:
-      - name: Stop remote instance
+      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -166,4 +152,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend unsigned integer on an AWS instance
-name: Cuda - Unsigned integer tests
+name: TFHE Cuda Backend - Unsigned integer tests

 env:
  CARGO_TERM_COLOR: always
@@ -13,16 +13,14 @@ env:
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  FAST_TESTS: TRUE
  NIGHTLY_TESTS: FALSE
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
-  # Secrets will be available only to zama-ai organization members
-  SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
-  EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types:
+      - opened
+      - synchronize
  schedule:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"
@@ -31,7 +29,7 @@ jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -39,13 +37,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
-        uses: tj-actions/changed-files@dcc7a0cba800f454d79fff4b993e8c3555bcc0a8
+        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -72,11 +70,10 @@ jobs:
      needs.should-run.outputs.gpu_test == 'true'
    runs-on: ubuntu-latest
    outputs:
-      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
+      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
-      - name: Start remote instance
-        id: start-remote-instance
-        if: env.SECRETS_AVAILABLE == 'true'
+      - name: Start instance
+        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
@@ -86,20 +83,13 @@ jobs:
          backend: hyperstack
          profile: gpu-test

-      # This instance will be spawned especially for pull-request from forked repository
-      - name: Start GitHub instance
-        id: start-github-instance
-        if: env.SECRETS_AVAILABLE == 'false'
-        run: |
-          echo "runner_group=${{ env.EXTERNAL_CONTRIBUTION_RUNNER }}" >> "$GITHUB_OUTPUT"
-
  cuda-unsigned-integer-tests:
    name: CUDA unsigned integer tests
    needs: [ should-run, setup-instance ]
    if: github.event_name != 'pull_request' ||
      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -113,16 +103,16 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
-        uses: ./.github/actions/gpu_setup
+        uses: ./.github/actions/hyperstack_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}
-          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -149,11 +139,10 @@ jobs:
    continue-on-error: true
    steps:
      - name: Send message
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}
-          SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests)
@@ -163,7 +152,6 @@ jobs:
    steps:
      - name: Stop instance
        id: stop-instance
-        if: env.SECRETS_AVAILABLE == 'true'
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
@@ -178,4 +166,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-integer-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -51,7 +51,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -3,7 +3,7 @@ name: Tests on M1 CPU
 on:
  workflow_dispatch:
  pull_request:
-    types: [ labeled ]
+    types: [labeled]
  # Have a nightly build for M1 tests
  schedule:
    # * is a special character in YAML so you have to quote this string
@@ -21,17 +21,14 @@ env:
  # We clear the cache to reduce memory pressure because of the numerous processes of cargo
  # nextest
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
-  CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}

 concurrency:
-  group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

 jobs:
  cargo-builds-m1:
-    if: ${{ (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs') ||
-      github.event_name == 'workflow_dispatch' ||
-      contains(github.event.label.name, 'm1_test') }}
+    if: ${{ (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
    runs-on: ["self-hosted", "m1mac"]
    # 12 hours, default is 6 hours, hopefully this is more than enough
    timeout-minutes: 720
@@ -40,7 +37,6 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: "false"
-          token: ${{ env.CHECKOUT_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -195,8 +191,6 @@ jobs:
          SLACK_COLOR: ${{ needs.cargo-builds-m1.result }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "M1 tests finished with status: ${{ needs.cargo-builds-m1.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "M1 tests finished with status: ${{ needs.cargo-builds-m1.result }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-          MSG_MINIMAL: event,action url,commit
-          BRANCH: ${{ github.ref }}
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -46,8 +46,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: Prepare package
        run: |
          cargo package -p tfhe
@@ -78,17 +77,12 @@ jobs:
    name: Publish Release
    needs: [package] # for comparing hashes
    runs-on: ubuntu-latest
-    # For provenance of npmjs publish
-    permissions:
-      contents: read
-      id-token: write
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: Create NPM version tag
        if: ${{ inputs.npm_latest_tag }}
        run: |
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -61,8 +61,11 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: "false"
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+
+      - name: Set up home
+        run: |
+          echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -123,35 +126,7 @@ jobs:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 9
-    env:
-      CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
    steps:
-      - name: Install latest stable
-        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
-        with:
-          toolchain: stable
-
-      - name: Export CUDA variables
-        if: ${{ !cancelled() }}
-        run: |
-          echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
-          {
-            echo "CUDA_PATH=$CUDA_PATH";
-            echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
-            echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
-          } >> "${GITHUB_ENV}"
-
-      # Specify the correct host compilers
-      - name: Export gcc and g++ variables
-        if: ${{ !cancelled() }}
-        run: |
-          {
-            echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
-            echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
-            echo "HOME=/home/ubuntu";
-          } >> "${GITHUB_ENV}"
-
      - name: Publish crate.io package
        env:
          CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
@@ -186,7 +161,7 @@ jobs:
  teardown-instance:
    name: Teardown instance (publish-release)
    if: ${{ always() && needs.setup-instance.result == 'success' }}
-    needs: [setup-instance, publish-cuda-release]
+    needs: [ setup-instance, publish-cuda-release ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
--- a/.github/workflows/make_release_tfhe_csprng.yml
+++ b/.github/workflows/make_release_tfhe_csprng.yml
@@ -30,7 +30,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe-csprng
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
        with:
          name: crate-tfhe-csprng
          path: target/package/*.crate
--- a/.github/workflows/make_release_tfhe_fft.yml
+++ b/.github/workflows/make_release_tfhe_fft.yml
@@ -33,7 +33,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe-fft
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
        with:
          name: crate
          path: target/package/*.crate
--- a/.github/workflows/make_release_tfhe_ntt.yml
+++ b/.github/workflows/make_release_tfhe_ntt.yml
@@ -33,7 +33,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe-ntt
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
        with:
          name: crate
          path: target/package/*.crate
--- a/.github/workflows/make_release_tfhe_versionable.yml
+++ b/.github/workflows/make_release_tfhe_versionable.yml
@@ -30,7 +30,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe-versionable-derive
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
        with:
          name: crate-tfhe-versionable-derive
          path: target/package/*.crate
@@ -61,8 +61,6 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Download artifact
        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
        with:
@@ -105,13 +103,13 @@ jobs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
      - name: Prepare package
        run: |
          cargo package -p tfhe-versionable
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
        with:
          name: crate-tfhe-versionable
          path: target/package/*.crate
@@ -139,7 +137,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
        with:
          fetch-depth: 0
      - name: Download artifact
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -24,7 +24,7 @@ jobs:
        - name: Prepare package
          run: |
            cargo package -p tfhe-zk-pok
-        - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+        - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
          with:
            name: crate-zk-pok
            path: target/package/*.crate
@@ -61,8 +61,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: Download artifact
        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
        with:
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -16,8 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: git-sync
        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
        with:
--- a/.gitignore
+++ b/.gitignore
@@ -32,8 +32,5 @@ web-test-runner/
 node_modules/
 package-lock.json

-# Python .env
-.env
-
 # Dir used for backward compatibility test data
 tests/tfhe-backward-compat-data/
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2024 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/61
+++ b/61
@@ -20,7 +20,7 @@ BENCH_OP_FLAVOR?=DEFAULT
 BENCH_TYPE?=latency
 NODE_VERSION=22.6
 BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
-BACKWARD_COMPAT_DATA_BRANCH?=$(shell ./scripts/backward_compat_data_version.py)
+BACKWARD_COMPAT_DATA_BRANCH?=v0.5
 BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
 BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
 TFHE_SPEC:=tfhe
@@ -282,14 +282,14 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
 		--all-targets \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
 		--all-targets \
 		-p $(TFHE_SPEC)

@@ -363,18 +363,7 @@ clippy_rustdoc: install_rs_check_toolchain
 	fi && \
 	CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
 		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental \
-		-p $(TFHE_SPEC)
-
-.PHONY: clippy_rustdoc_gpu # Run clippy lints on doctests enabling the boolean, shortint, integer and zk-pok
-clippy_rustdoc_gpu: install_rs_check_toolchain
-	if [[ "$(OS)" != "Linux" ]]; then \
-		echo "WARNING: skipped clippy_rustdoc_gpu, unsupported OS $(OS)"; \
-		exit 0; \
-	fi && \
-	CLIPPYFLAGS="-D warnings" RUSTDOCFLAGS="--no-run --nocapture --test-builder ./scripts/clippy_driver.sh -Z unstable-options" \
-		cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" test --doc \
-		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings,experimental,gpu \
+		--features=boolean,shortint,integer,zk-pok,pbs-stats,strings \
 		-p $(TFHE_SPEC)

 .PHONY: clippy_c_api # Run clippy lints enabling the boolean, shortint and the C API
@@ -405,10 +394,10 @@ clippy_trivium: install_rs_check_toolchain
 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
@@ -967,10 +956,6 @@ check_intra_md_links: install_mlc
 check_md_links: install_mlc
 	mlc --match-file-extension tfhe/docs

-.PHONY: check_parameter_export_ok # Checks exported "current" shortint parameter module is correct
-check_parameter_export_ok:
-	python3 ./scripts/check_current_param_export.py
-
 .PHONY: check_compile_tests # Build tests in debug without running them
 check_compile_tests: install_rs_build_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
@@ -1071,35 +1056,35 @@ bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
@@ -1107,7 +1092,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -1115,7 +1100,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1123,7 +1108,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
 bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1131,14 +1116,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- ::unsigned

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench zk-pke-bench \
-	--features=integer,internal-keycache,zk-pok,nightly-avx512 \
+	--features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \
 	-p $(TFHE_SPEC) --

 .PHONY: bench_shortint # Run benchmarks for shortint
@@ -1296,7 +1281,7 @@ parse_wasm_benchmarks: install_rs_check_toolchain

 .PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
 write_params_to_file: install_rs_check_toolchain
-	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
 	--example write_params_to_file --features=boolean,shortint,internal-keycache

 .PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
@@ -1328,17 +1313,15 @@ sha256_bool: install_rs_check_toolchain
 	--example sha256_bool --features=boolean

 .PHONY: pcc # pcc stands for pre commit checks (except GPU)
-pcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
-check_md_docs_are_tested check_intra_md_links clippy_all check_compile_tests test_tfhe_lints \
-tfhe_lints
+pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested check_intra_md_links \
+clippy_all check_compile_tests test_tfhe_lints tfhe_lints

 .PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
-pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
-clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
+pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu check_rust_bindings_did_not_change

 .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
-fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
-check_md_docs_are_tested clippy_fast check_compile_tests
+fpcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested clippy_fast \
+check_compile_tests

 .PHONY: conformance # Automatically fix problems that can be fixed
 conformance: fix_newline fmt fmt_js
--- a/apps/trivium/README.md
+++ b/apps/trivium/README.md
@@ -18,102 +18,102 @@ use tfhe::prelude::*;
 use tfhe_trivium::TriviumStream;

 fn get_hexadecimal_string_from_lsb_first_stream(a: Vec<bool>) -> String {
-    assert!(a.len() % 8 == 0);
-    let mut hexadecimal: String = "".to_string();
-    for test in a.chunks(8) {
-        // Encoding is bytes in LSB order
-        match test[4..8] {
-            [false, false, false, false] => hexadecimal.push('0'),
-            [true, false, false, false] => hexadecimal.push('1'),
-            [false, true, false, false] => hexadecimal.push('2'),
-            [true, true, false, false] => hexadecimal.push('3'),
+	assert!(a.len() % 8 == 0);
+	let mut hexadecimal: String = "".to_string();
+	for test in a.chunks(8) {
+		// Encoding is bytes in LSB order
+		match test[4..8] {
+			[false, false, false, false] => hexadecimal.push('0'),
+			[true, false, false, false] => hexadecimal.push('1'),
+			[false, true, false, false] => hexadecimal.push('2'),
+			[true, true, false, false] => hexadecimal.push('3'),

-            [false, false, true, false] => hexadecimal.push('4'),
-            [true, false, true, false] => hexadecimal.push('5'),
-            [false, true, true, false] => hexadecimal.push('6'),
-            [true, true, true, false] => hexadecimal.push('7'),
+			[false, false, true, false] => hexadecimal.push('4'),
+			[true, false, true, false] => hexadecimal.push('5'),
+			[false, true, true, false] => hexadecimal.push('6'),
+			[true, true, true, false] => hexadecimal.push('7'),

-            [false, false, false, true] => hexadecimal.push('8'),
-            [true, false, false, true] => hexadecimal.push('9'),
-            [false, true, false, true] => hexadecimal.push('A'),
-            [true, true, false, true] => hexadecimal.push('B'),
+			[false, false, false, true] => hexadecimal.push('8'),
+			[true, false, false, true] => hexadecimal.push('9'),
+			[false, true, false, true] => hexadecimal.push('A'),
+			[true, true, false, true] => hexadecimal.push('B'),

-            [false, false, true, true] => hexadecimal.push('C'),
-            [true, false, true, true] => hexadecimal.push('D'),
-            [false, true, true, true] => hexadecimal.push('E'),
-            [true, true, true, true] => hexadecimal.push('F'),
-            _ => ()
-        };
-        match test[0..4] {
-            [false, false, false, false] => hexadecimal.push('0'),
-            [true, false, false, false] => hexadecimal.push('1'),
-            [false, true, false, false] => hexadecimal.push('2'),
-            [true, true, false, false] => hexadecimal.push('3'),
+			[false, false, true, true] => hexadecimal.push('C'),
+			[true, false, true, true] => hexadecimal.push('D'),
+			[false, true, true, true] => hexadecimal.push('E'),
+			[true, true, true, true] => hexadecimal.push('F'),
+			_ => ()
+		};
+		match test[0..4] {
+			[false, false, false, false] => hexadecimal.push('0'),
+			[true, false, false, false] => hexadecimal.push('1'),
+			[false, true, false, false] => hexadecimal.push('2'),
+			[true, true, false, false] => hexadecimal.push('3'),

-            [false, false, true, false] => hexadecimal.push('4'),
-            [true, false, true, false] => hexadecimal.push('5'),
-            [false, true, true, false] => hexadecimal.push('6'),
-            [true, true, true, false] => hexadecimal.push('7'),
+			[false, false, true, false] => hexadecimal.push('4'),
+			[true, false, true, false] => hexadecimal.push('5'),
+			[false, true, true, false] => hexadecimal.push('6'),
+			[true, true, true, false] => hexadecimal.push('7'),

-            [false, false, false, true] => hexadecimal.push('8'),
-            [true, false, false, true] => hexadecimal.push('9'),
-            [false, true, false, true] => hexadecimal.push('A'),
-            [true, true, false, true] => hexadecimal.push('B'),
+			[false, false, false, true] => hexadecimal.push('8'),
+			[true, false, false, true] => hexadecimal.push('9'),
+			[false, true, false, true] => hexadecimal.push('A'),
+			[true, true, false, true] => hexadecimal.push('B'),

-            [false, false, true, true] => hexadecimal.push('C'),
-            [true, false, true, true] => hexadecimal.push('D'),
-            [false, true, true, true] => hexadecimal.push('E'),
-            [true, true, true, true] => hexadecimal.push('F'),
-            _ => ()
-        };
-    }
-    return hexadecimal;
+			[false, false, true, true] => hexadecimal.push('C'),
+			[true, false, true, true] => hexadecimal.push('D'),
+			[false, true, true, true] => hexadecimal.push('E'),
+			[true, true, true, true] => hexadecimal.push('F'),
+			_ => ()
+		};
+	}
+	return hexadecimal;
 }

 fn main() {
-    let config = ConfigBuilder::default().build();
-    let (client_key, server_key) = generate_keys(config);
+	let config = ConfigBuilder::default().build();
+	let (client_key, server_key) = generate_keys(config);

-    let key_string = "0053A6F94C9FF24598EB".to_string();
-    let mut key = [false; 80];
+	let key_string = "0053A6F94C9FF24598EB".to_string();
+	let mut key = [false; 80];

-    for i in (0..key_string.len()).step_by(2) {
-        let mut val: u8 = u8::from_str_radix(&key_string[i..i+2], 16).unwrap();
-        for j in 0..8 {
-            key[8*(i>>1) + j] = val % 2 == 1;
-            val >>= 1;
-        }
-    }
+	for i in (0..key_string.len()).step_by(2) {
+		let mut val: u8 = u8::from_str_radix(&key_string[i..i+2], 16).unwrap();
+		for j in 0..8 {
+			key[8*(i>>1) + j] = val % 2 == 1;
+			val >>= 1;
+		}
+	}

-    let iv_string = "0D74DB42A91077DE45AC".to_string();
-    let mut iv = [false; 80];
+	let iv_string = "0D74DB42A91077DE45AC".to_string();
+	let mut iv = [false; 80];

-    for i in (0..iv_string.len()).step_by(2) {
-        let mut val: u8 = u8::from_str_radix(&iv_string[i..i+2], 16).unwrap();
-        for j in 0..8 {
-            iv[8*(i>>1) + j] = val % 2 == 1;
-            val >>= 1;
-        }
-    }
+	for i in (0..iv_string.len()).step_by(2) {
+		let mut val: u8 = u8::from_str_radix(&iv_string[i..i+2], 16).unwrap();
+		for j in 0..8 {
+			iv[8*(i>>1) + j] = val % 2 == 1;
+			val >>= 1;
+		}
+	}

-    let output_0_63    = "F4CD954A717F26A7D6930830C4E7CF0819F80E03F25F342C64ADC66ABA7F8A8E6EAA49F23632AE3CD41A7BD290A0132F81C6D4043B6E397D7388F3A03B5FE358".to_string();
+	let output_0_63    = "F4CD954A717F26A7D6930830C4E7CF0819F80E03F25F342C64ADC66ABA7F8A8E6EAA49F23632AE3CD41A7BD290A0132F81C6D4043B6E397D7388F3A03B5FE358".to_string();

-    let cipher_key = key.map(|x| FheBool::encrypt(x, &client_key));
-    let cipher_iv = iv.map(|x| FheBool::encrypt(x, &client_key));
+	let cipher_key = key.map(|x| FheBool::encrypt(x, &client_key));
+	let cipher_iv = iv.map(|x| FheBool::encrypt(x, &client_key));


-    let mut trivium = TriviumStream::<FheBool>::new(cipher_key, cipher_iv, &server_key);
+	let mut trivium = TriviumStream::<FheBool>::new(cipher_key, cipher_iv, &server_key);

-    let mut vec = Vec::<bool>::with_capacity(64*8);
-    while vec.len() < 64*8 {
-        let cipher_outputs = trivium.next_64();
-        for c in cipher_outputs {
-            vec.push(c.decrypt(&client_key))
-        }
-    }
+	let mut vec = Vec::<bool>::with_capacity(64*8);
+	while vec.len() < 64*8 {
+		let cipher_outputs = trivium.next_64();
+		for c in cipher_outputs {
+			vec.push(c.decrypt(&client_key))
+		}
+	}

-    let hexadecimal = get_hexadecimal_string_from_lsb_first_stream(vec);
-    assert_eq!(output_0_63, hexadecimal[0..64*2]);
+	let hexadecimal = get_hexadecimal_string_from_lsb_first_stream(vec);
+	assert_eq!(output_0_63, hexadecimal[0..64*2]);
 }
 ```

@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.

 # FHE shortint Trivium implementation

-The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
+The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64`).
 It uses a lower level API of tfhe-rs, so the syntax is a little bit different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
 on the same cryptographic parameters as the high level API of tfhe-rs. As such, it requires the usage of a casting key, to switch from one parameter space to another, which makes
 its setup a little more intricate.
@@ -137,68 +137,67 @@ its setup a little more intricate.
 Example code:
 ```rust
 use tfhe::shortint::prelude::*;
-use tfhe::shortint::parameters::v1_0::{
-    V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
 };
 use tfhe::{ConfigBuilder, generate_keys, FheUint64};
 use tfhe::prelude::*;
 use tfhe_trivium::TriviumStreamShortint;

 fn test_shortint() {
-    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+	let config = ConfigBuilder::default()
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
        .build();
-    let (hl_client_key, hl_server_key) = generate_keys(config);
+	let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

-    let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+	let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);
    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

-    let key_string = "0053A6F94C9FF24598EB".to_string();
-    let mut key = [0; 80];
+	let key_string = "0053A6F94C9FF24598EB".to_string();
+	let mut key = [0; 80];

-    for i in (0..key_string.len()).step_by(2) {
-        let mut val = u64::from_str_radix(&key_string[i..i+2], 16).unwrap();
-        for j in 0..8 {
-            key[8*(i>>1) + j] = val % 2;
-            val >>= 1;
-        }
-    }
+	for i in (0..key_string.len()).step_by(2) {
+		let mut val = u64::from_str_radix(&key_string[i..i+2], 16).unwrap();
+		for j in 0..8 {
+			key[8*(i>>1) + j] = val % 2;
+			val >>= 1;
+		}
+	}

-    let iv_string = "0D74DB42A91077DE45AC".to_string();
-    let mut iv = [0; 80];
+	let iv_string = "0D74DB42A91077DE45AC".to_string();
+	let mut iv = [0; 80];

-    for i in (0..iv_string.len()).step_by(2) {
-        let mut val = u64::from_str_radix(&iv_string[i..i+2], 16).unwrap();
-        for j in 0..8 {
-            iv[8*(i>>1) + j] = val % 2;
-            val >>= 1;
-        }
-    }
-    let output_0_63    = "F4CD954A717F26A7D6930830C4E7CF0819F80E03F25F342C64ADC66ABA7F8A8E6EAA49F23632AE3CD41A7BD290A0132F81C6D4043B6E397D7388F3A03B5FE358".to_string();
+	for i in (0..iv_string.len()).step_by(2) {
+		let mut val = u64::from_str_radix(&iv_string[i..i+2], 16).unwrap();
+		for j in 0..8 {
+			iv[8*(i>>1) + j] = val % 2;
+			val >>= 1;
+		}
+	}
+	let output_0_63    = "F4CD954A717F26A7D6930830C4E7CF0819F80E03F25F342C64ADC66ABA7F8A8E6EAA49F23632AE3CD41A7BD290A0132F81C6D4043B6E397D7388F3A03B5FE358".to_string();

-    let cipher_key = key.map(|x| client_key.encrypt(x));
-    let cipher_iv = iv.map(|x| client_key.encrypt(x));
+	let cipher_key = key.map(|x| client_key.encrypt(x));
+	let cipher_iv = iv.map(|x| client_key.encrypt(x));

-    let mut ciphered_message = vec![FheUint64::try_encrypt(0u64, &hl_client_key).unwrap(); 9];
+	let mut ciphered_message = vec![FheUint64::try_encrypt(0u64, &hl_client_key).unwrap(); 9];

-    let mut trivium = TriviumStreamShortint::new(cipher_key, cipher_iv, &server_key, &ksk);
+	let mut trivium = TriviumStreamShortint::new(cipher_key, cipher_iv, &server_key, &ksk);

-    let mut vec = Vec::<u64>::with_capacity(8);
-    while vec.len() < 8 {
-        let trans_ciphered_message = trivium.trans_encrypt_64(ciphered_message.pop().unwrap(), &hl_server_key);
-        vec.push(trans_ciphered_message.decrypt(&hl_client_key));
-    }
+	let mut vec = Vec::<u64>::with_capacity(8);
+	while vec.len() < 8 {
+		let trans_ciphered_message = trivium.trans_encrypt_64(ciphered_message.pop().unwrap(), &hl_server_key);
+		vec.push(trans_ciphered_message.decrypt(&hl_client_key));
+	}

-    let hexadecimal = get_hexagonal_string_from_u64(vec);
-    assert_eq!(output_0_63, hexadecimal[0..64*2]);
+	let hexadecimal = get_hexagonal_string_from_u64(vec);
+	assert_eq!(output_0_63, hexadecimal[0..64*2]);
 }
 ```

--- a/apps/trivium/benches/kreyvium_shortint.rs
+++ b/apps/trivium/benches/kreyvium_shortint.rs
@@ -1,9 +1,8 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_0::{
-    V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +10,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};

 pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -64,19 +63,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {

 pub fn kreyvium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -112,19 +111,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {

 pub fn kreyvium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/benches/trivium_shortint.rs
+++ b/apps/trivium/benches/trivium_shortint.rs
@@ -1,9 +1,8 @@
 use criterion::Criterion;
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_0::{
-    V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
 };
 use tfhe::shortint::prelude::*;
 use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +10,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};

 pub fn trivium_shortint_warmup(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -64,19 +63,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {

 pub fn trivium_shortint_gen(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -112,19 +111,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {

 pub fn trivium_shortint_trans(c: &mut Criterion) {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/apps/trivium/src/kreyvium/test.rs
+++ b/apps/trivium/src/kreyvium/test.rs
@@ -1,9 +1,8 @@
 use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_0::{
-    V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo renaud1239/Kreyvium,
@@ -221,19 +220,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn kreyvium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
--- a/apps/trivium/src/trivium/test.rs
+++ b/apps/trivium/src/trivium/test.rs
@@ -1,9 +1,8 @@
 use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
 use tfhe::prelude::*;
-use tfhe::shortint::parameters::v1_0::{
-    V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
-    V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
+use tfhe::shortint::parameters::{
+    V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64,
+    V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64,
 };
 use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
 // Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
@@ -357,19 +356,19 @@ use tfhe::shortint::prelude::*;
 #[test]
 fn trivium_test_shortint_long() {
    let config = ConfigBuilder::default()
-        .use_custom_parameters(V1_0_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
+        .use_custom_parameters(V0_11_PARAM_MESSAGE_2_CARRY_2_PBS_KS_GAUSSIAN_2M64)
        .build();
    let (hl_client_key, hl_server_key) = generate_keys(config);
    let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
    let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();

    let (client_key, server_key): (ClientKey, ServerKey) =
-        gen_keys(V1_0_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
+        gen_keys(V0_11_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M64);

    let ksk = KeySwitchingKey::new(
        (&client_key, Some(&server_key)),
        (&underlying_ck, &underlying_sk),
-        V1_0_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
+        V0_11_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS,
    );

    let key_string = "0053A6F94C9FF24598EB".to_string();
--- a/backends/tfhe-cuda-backend/Cargo.toml
+++ b/backends/tfhe-cuda-backend/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tfhe-cuda-backend"
-version = "0.8.0"
+version = "0.7.0"
 edition = "2021"
 authors = ["Zama team"]
 license = "BSD-3-Clause-Clear"
--- a/backends/tfhe-cuda-backend/LICENSE
+++ b/backends/tfhe-cuda-backend/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause Clear License

-Copyright © 2025 ZAMA.
+Copyright © 2024 ZAMA.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -27,8 +27,6 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
    std::abort();                                                              \
  }

-void cuda_set_device(uint32_t gpu_index);
-
 cudaEvent_t cuda_create_event(uint32_t gpu_index);

 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -112,8 +112,6 @@ template <typename Torus> struct int_decompression {

      generate_device_accumulator_with_encoding<Torus>(
          streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
-          decompression_rescale_lut->get_degree(0),
-          decompression_rescale_lut->get_max_degree(0),
          encryption_params.glwe_dimension, encryption_params.polynomial_size,
          effective_compression_message_modulus,
          effective_compression_carry_modulus,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -38,15 +38,6 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
 enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };

 extern "C" {
-
-typedef struct {
-  void *ptr;
-  uint64_t *degrees;
-  uint64_t *noise_levels;
-  uint32_t num_radix_blocks;
-  uint32_t lwe_dimension;
-} CudaRadixCiphertextFFI;
-
 void scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -54,7 +45,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory);
+    bool allocate_gpu_memory);
 void scratch_cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -62,12 +53,13 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory);
-void cuda_apply_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks);
+    uint32_t num_many_lut, bool allocate_gpu_memory);
+void cuda_apply_univariate_lut_kb_64(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, void *output_radix_lwe,
+                                     void const *input_radix_lwe,
+                                     int8_t *mem_ptr, void *const *ksks,
+                                     void *const *bsks, uint32_t num_blocks);

 void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
                                             uint32_t const *gpu_indexes,
@@ -81,15 +73,13 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory);
+    bool allocate_gpu_memory);

 void cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe_1,
-    CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
-    uint32_t shift);
+    void *output_radix_lwe, void const *input_radix_lwe_1,
+    void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
+    void *const *bsks, uint32_t num_blocks, uint32_t shift);

 void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
                                            uint32_t const *gpu_indexes,
@@ -98,10 +88,9 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,

 void cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_luts,
-    uint32_t lut_stride);
+    void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks,
+    uint32_t num_luts, uint32_t lut_stride);

 void scratch_cuda_full_propagation_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -143,14 +132,15 @@ void cleanup_cuda_integer_mult(void *const *streams,

 void cuda_negate_integer_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
+    void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus);

 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
-    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus);
+    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);

 void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -163,8 +153,8 @@ void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(

 void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);

 void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -177,8 +167,8 @@ void scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(

 void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, uint32_t shift, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *lwe_array, uint32_t shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);

 void cleanup_cuda_integer_radix_logical_scalar_shift(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -199,8 +189,8 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(

 void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI const *lwe_shift,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+    void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);

 void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
                                                 uint32_t const *gpu_indexes,
@@ -243,17 +233,15 @@ void scratch_cuda_integer_radix_bitop_kb_64(

 void cuda_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_1,
-    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count);

 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
+    void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
+    void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op);

 void cleanup_cuda_integer_bitop(void *const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -270,11 +258,9 @@ void scratch_cuda_integer_radix_cmux_kb_64(

 void cuda_cmux_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_condition,
-    CudaRadixCiphertextFFI const *lwe_array_true,
-    CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
+    void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_ciphertext_count);

 void cleanup_cuda_integer_radix_cmux(void *const *streams,
                                     uint32_t const *gpu_indexes,
@@ -291,8 +277,8 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(

 void cuda_integer_radix_scalar_rotate_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);

 void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
                                              uint32_t const *gpu_indexes,
@@ -319,16 +305,15 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(

 void cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI *carry_out,
-    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);
+    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks,
+    uint32_t requested_flag, uint32_t uses_carry);

 void cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
-    CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t requested_flag, uint32_t uses_carry);
+    void *lhs_array, const void *rhs_array, void *carry_out,
+    const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry);

 void cleanup_cuda_propagate_single_carry(void *const *streams,
                                         uint32_t const *gpu_indexes,
@@ -351,10 +336,9 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(

 void cuda_integer_overflowing_sub_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
-    CudaRadixCiphertextFFI *overflow_block,
-    const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t compute_overflow,
+    void *lhs_array, const void *rhs_array, void *overflow_block,
+    const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
    uint32_t uses_input_borrow);

 void cleanup_cuda_integer_overflowing_sub(void *const *streams,
@@ -427,13 +411,12 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory);
+    bool allocate_gpu_memory);

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_blocks);
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift);

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -441,8 +424,9 @@ void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(

 void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
                                            uint32_t const *gpu_indexes,
-                                            uint32_t gpu_count,
-                                            CudaRadixCiphertextFFI *lwe_array);
+                                            uint32_t gpu_count, void *lwe_array,
+                                            uint32_t num_blocks,
+                                            uint32_t lwe_size);

 void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -455,8 +439,8 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

 void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
-    void *const *bsks, void *const *ksks);
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);

 void cleanup_cuda_integer_abs_inplace(void *const *streams,
                                      uint32_t const *gpu_indexes,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
@@ -1,8 +0,0 @@
-#ifndef CUDA_RADIX_CIPHERTEXT_H
-#define CUDA_RADIX_CIPHERTEXT_H
-
-void release_radix_ciphertext(cudaStream_t const stream,
-                              uint32_t const gpu_index,
-                              CudaRadixCiphertextFFI *data);
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -1,7 +1,6 @@
 #ifndef CUDA_LINALG_H_
 #define CUDA_LINALG_H_

-#include "integer/integer.h"
 #include <stdint.h>

 extern "C" {
@@ -14,14 +13,17 @@ void cuda_negate_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, const uint32_t input_lwe_dimension,
    const uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
-                                       CudaRadixCiphertextFFI *output,
-                                       CudaRadixCiphertextFFI const *input_1,
-                                       CudaRadixCiphertextFFI const *input_2);
-void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
-                                       CudaRadixCiphertextFFI *output,
-                                       CudaRadixCiphertextFFI const *input_1,
-                                       CudaRadixCiphertextFFI const *input_2);
+void cuda_add_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
+
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -5,12 +5,12 @@

 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
-    uint32_t polynomial_size, int max_shared_memory);
+    uint32_t polynomial_size);

 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, int max_shared_memory);
+    uint32_t level_count);

 #if CUDA_ARCH >= 900
 template <typename Torus>
@@ -114,8 +114,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
-    cuda_set_device(gpu_index);
-
    this->pbs_variant = pbs_variant;
    this->lwe_chunk_size = lwe_chunk_size;
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -61,7 +61,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {

 template <typename Torus>
 bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
-    uint32_t polynomial_size, int max_shared_memory);
+    uint32_t polynomial_size);

 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

@@ -77,10 +77,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
             bool allocate_gpu_memory) {
-    cuda_set_device(gpu_index);
+
    this->pbs_variant = pbs_variant;

-    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+    auto max_shared_memory = cuda_get_max_shared_memory(0);

    if (allocate_gpu_memory) {
      switch (pbs_variant) {
@@ -157,7 +157,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

        bool supports_dsm =
            supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-                Torus>(polynomial_size, max_shared_memory);
+                Torus>(polynomial_size);

        uint64_t full_sm =
            get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
@@ -218,7 +218,8 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 template <typename Torus>
 uint64_t get_buffer_size_programmable_bootstrap_cg(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {
+  int max_shared_memory = cuda_get_max_shared_memory(0);
  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
  uint64_t partial_sm =
@@ -244,8 +245,7 @@ template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                   uint32_t polynomial_size,
                                                   uint32_t level_count,
-                                                   uint32_t num_samples,
-                                                   int max_shared_memory);
+                                                   uint32_t num_samples);

 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -8,7 +8,7 @@ extern "C" {

 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, int max_shared_memory);
+    uint32_t num_samples);

 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -11,7 +11,7 @@ void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
                                               uint32_t gpu_index, T *dest,
                                               T *src, uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
 }
@@ -21,7 +21,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
                                               uint32_t gpu_index, T *dest,
                                               T *src, uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }
@@ -55,7 +55,7 @@ __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
                                  Torus const *glwe_array_in,
                                  uint32_t const *nth_array, uint32_t num_nths,
                                  uint32_t glwe_dimension) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  dim3 grid(num_nths);
  dim3 thds(params::degree / params::opt);
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
@@ -261,7 +261,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(

  // Optimization of packing keyswitch when packing many LWEs

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  check_cuda_error(cudaGetLastError());

  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
@@ -57,7 +57,7 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
  if (gpu_count != 1)
    PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
          "supported yet.")
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);

  int shared_memory_size = sizeof(double) * polynomial_size;

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -45,19 +45,19 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
          const Torus *__restrict__ lwe_input_indexes,
          const Torus *__restrict__ ksk, uint32_t lwe_dimension_in,
          uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) {
-  const int tid = threadIdx.x + blockIdx.y * blockDim.x;
+  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;

  extern __shared__ int8_t sharedmem[];
  Torus *lwe_acc_out = (Torus *)sharedmem;
  auto block_lwe_array_out = get_chunk(
-      lwe_array_out, lwe_output_indexes[blockIdx.x], lwe_dimension_out + 1);
+      lwe_array_out, lwe_output_indexes[blockIdx.y], lwe_dimension_out + 1);

  if (tid <= lwe_dimension_out) {

    Torus local_lwe_out = 0;
    auto block_lwe_array_in = get_chunk(
-        lwe_array_in, lwe_input_indexes[blockIdx.x], lwe_dimension_in + 1);
+        lwe_array_in, lwe_input_indexes[blockIdx.y], lwe_dimension_in + 1);

    if (tid == lwe_dimension_out && threadIdx.y == 0) {
      local_lwe_out = block_lwe_array_in[lwe_dimension_in];
@@ -105,22 +105,16 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  constexpr int num_threads_y = 32;
-  int num_blocks_per_sample, num_threads_x;
+  int num_blocks, num_threads_x;

  getNumBlocksAndThreads2D(lwe_dimension_out + 1, 512, num_threads_y,
-                           num_blocks_per_sample, num_threads_x);
+                           num_blocks, num_threads_x);

  int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
-  if (num_blocks_per_sample > 65536)
-    PANIC("Cuda error (Keyswith): number of blocks per sample is too large");
-
-  // In multiplication of large integers (512, 1024, 2048), the number of
-  // samples can be larger than 65536, so we need to set it in the first
-  // dimension of the grid
-  dim3 grid(num_samples, num_blocks_per_sample, 1);
+  dim3 grid(num_blocks, num_samples, 1);
  dim3 threads(num_threads_x, num_threads_y, 1);

  keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
@@ -166,7 +160,7 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
    cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_lwes, bool allocate_gpu_memory) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -110,7 +110,7 @@ template <typename Torus>
 __host__ void host_modulus_switch_inplace(cudaStream_t stream,
                                          uint32_t gpu_index, Torus *array,
                                          int size, uint32_t log_modulus) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  int num_threads = 0, num_blocks = 0;
  getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -2,12 +2,8 @@
 #include <cstdint>
 #include <cuda_runtime.h>

-void cuda_set_device(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-}
-
 cudaEvent_t cuda_create_event(uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  cudaEvent_t event;
  check_cuda_error(cudaEventCreate(&event));
  return event;
@@ -15,24 +11,24 @@ cudaEvent_t cuda_create_event(uint32_t gpu_index) {

 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
                       uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaEventRecord(event, stream));
 }

 void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
                            uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaStreamWaitEvent(stream, event, 0));
 }

 void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaEventDestroy(event));
 }

 /// Unsafe function to create a CUDA stream, must check first that GPU exists
 cudaStream_t cuda_create_stream(uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  cudaStream_t stream;
  check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  return stream;
@@ -40,22 +36,15 @@ cudaStream_t cuda_create_stream(uint32_t gpu_index) {

 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
 void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaStreamDestroy(stream));
 }

 void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaStreamSynchronize(stream));
 }

-void synchronize_streams(cudaStream_t const *streams,
-                         uint32_t const *gpu_indexes, uint32_t gpu_count) {
-  for (uint i = 0; i < gpu_count; i++) {
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-  }
-}
-
 // Determine if a CUDA device is available at runtime
 uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }

@@ -63,7 +52,7 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  void *ptr;
  check_cuda_error(cudaMalloc((void **)&ptr, size));

@@ -74,7 +63,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 /// asynchronously.
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                        uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  void *ptr;

 #ifndef CUDART_VERSION
@@ -97,7 +86,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,

 /// Check that allocation is valid
 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  size_t total_mem, free_mem;
  check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
  if (size > free_mem) {
@@ -145,7 +134,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
  }

-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }
@@ -165,7 +154,7 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
  if (attr_src.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  if (attr_src.device == attr_dest.device) {
    check_cuda_error(
        cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
@@ -190,7 +179,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
  if (attr_src.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  if (attr_src.device == attr_dest.device) {
    check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
  } else {
@@ -201,7 +190,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,

 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaDeviceSynchronize());
 }

@@ -214,7 +203,7 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
  }
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }

@@ -234,7 +223,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
    if (attr.type != cudaMemoryTypeDevice) {
      PANIC("Cuda error: invalid dest device pointer in cuda set value.")
    }
-    cuda_set_device(gpu_index);
+    check_cuda_error(cudaSetDevice(gpu_index));
    int block_size = 256;
    int num_blocks = (n + block_size - 1) / block_size;

@@ -264,7 +253,7 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
  }

-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }
@@ -278,14 +267,14 @@ int cuda_get_number_of_gpus() {

 /// Drop a cuda array
 void cuda_drop(void *ptr, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaFree(ptr));
 }

 /// Drop a cuda array asynchronously, if supported on the device
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {

-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
@@ -22,14 +22,15 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

 void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
-    void *const *bsks, void *const *ksks) {
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {

  auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;

  host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
-                                gpu_count, ct, bsks, (uint64_t **)(ksks), mem,
-                                is_signed);
+                                gpu_count, static_cast<uint64_t *>(ct), bsks,
+                                (uint64_t **)(ksks), mem, is_signed,
+                                num_blocks);
 }

 void cleanup_cuda_integer_abs_inplace(void *const *streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -2,12 +2,15 @@
 #define TFHE_RS_ABS_CUH

 #include "crypto/keyswitch.cuh"
+#include "device.h"
 #include "integer/bitwise_ops.cuh"
 #include "integer/comparison.cuh"
 #include "integer/integer.cuh"
+#include "integer/integer_utilities.h"
 #include "integer/negation.cuh"
 #include "integer/scalar_shifts.cuh"
-#include "radix_ciphertext.cuh"
+#include "linear_algebra.h"
+#include "pbs/programmable_bootstrap.h"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <fstream>
@@ -29,15 +32,16 @@ __host__ void scratch_cuda_integer_abs_kb(
 }

 template <typename Torus>
-__host__ void legacy_host_integer_abs_kb_async(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *ct, void *const *bsks, uint64_t *const *ksks,
-    int_abs_buffer<uint64_t> *mem_ptr, bool is_signed, uint32_t num_blocks) {
+__host__ void
+host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+                    uint32_t gpu_count, Torus *ct, void *const *bsks,
+                    uint64_t *const *ksks, int_abs_buffer<uint64_t> *mem_ptr,
+                    bool is_signed, uint32_t num_blocks) {
  if (!is_signed)
    return;

  auto radix_params = mem_ptr->params;
-  auto mask = (Torus *)(mem_ptr->mask->ptr);
+  auto mask = mem_ptr->mask;

  auto big_lwe_dimension = radix_params.big_lwe_dimension;
  auto big_lwe_size = big_lwe_dimension + 1;
@@ -48,55 +52,20 @@ __host__ void legacy_host_integer_abs_kb_async(
  cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
                               streams[0], gpu_indexes[0]);

-  legacy_host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
+  host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
-  legacy_host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
-                              radix_params.big_lwe_dimension, num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
+                       radix_params.big_lwe_dimension, num_blocks);

  uint32_t requested_flag = outputFlag::FLAG_NONE;
  uint32_t uses_carry = 0;
-  legacy_host_propagate_single_carry<Torus>(
+  host_propagate_single_carry<Torus>(
      streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
      bsks, ksks, num_blocks, requested_flag, uses_carry);

-  // legacy bitop
-  legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, ct, mask, ct, bsks, ksks, num_blocks,
-      mem_ptr->bitxor_mem->lut, mem_ptr->bitxor_mem->params.message_modulus);
-}
-
-template <typename Torus>
-__host__ void
-host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-                    uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
-                    void *const *bsks, uint64_t *const *ksks,
-                    int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
-  if (!is_signed)
-    return;
-
-  auto mask = mem_ptr->mask;
-
-  uint32_t num_bits_in_ciphertext =
-      (31 - __builtin_clz(mem_ptr->params.message_modulus)) *
-      ct->num_radix_blocks;
-
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], mask, ct);
-
-  host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
-      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
-  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
-                       ct->num_radix_blocks);
-
-  uint32_t requested_flag = outputFlag::FLAG_NONE;
-  uint32_t uses_carry = 0;
-  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
-                                     nullptr, nullptr, mem_ptr->scp_mem, bsks,
-                                     ksks, requested_flag, uses_carry);
-
-  host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count, ct, mask,
-                                     ct, mem_ptr->bitxor_mem, bsks, ksks);
+  host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
+                              mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
 }

 #endif // TFHE_RS_ABS_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -22,15 +22,17 @@ void scratch_cuda_integer_radix_bitop_kb_64(

 void cuda_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_1,
-    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitop_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
-      lwe_array_1, lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_1),
+      static_cast<const uint64_t *>(lwe_array_2),
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      lwe_ciphertext_count);
 }

 void cleanup_cuda_integer_bitop(void *const *streams,
@@ -41,50 +43,3 @@ void cleanup_cuda_integer_bitop(void *const *streams,
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
-
-void update_degrees_after_bitand(uint64_t *output_degrees,
-                                 uint64_t *lwe_array_1_degrees,
-                                 uint64_t *lwe_array_2_degrees,
-                                 uint32_t num_radix_blocks) {
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    output_degrees[i] =
-        std::min(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-  }
-}
-
-void update_degrees_after_bitor(uint64_t *output_degrees,
-                                uint64_t *lwe_array_1_degrees,
-                                uint64_t *lwe_array_2_degrees,
-                                uint32_t num_radix_blocks) {
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    auto max = std::max(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-    auto min = std::min(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-    auto result = max;
-
-    for (uint j = 0; j < min + 1; j++) {
-      if (max | j > result) {
-        result = max | j;
-      }
-    }
-    output_degrees[i] = result;
-  }
-}
-
-void update_degrees_after_bitxor(uint64_t *output_degrees,
-                                 uint64_t *lwe_array_1_degrees,
-                                 uint64_t *lwe_array_2_degrees,
-                                 uint32_t num_radix_blocks) {
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    auto max = std::max(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-    auto min = std::min(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-    auto result = max;
-
-    // Try every possibility to find the worst case
-    for (uint j = 0; j < min + 1; j++) {
-      if (max ^ j > result) {
-        result = max ^ j;
-      }
-    }
-    output_degrees[i] = result;
-  }
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -14,34 +14,15 @@
 template <typename Torus>
 __host__ void host_integer_radix_bitop_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_1,
-    CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks) {
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
+    Torus const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
+    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;
-  uint64_t degrees[lwe_array_1->num_radix_blocks];
-  if (mem_ptr->op == BITOP_TYPE::BITAND) {
-    update_degrees_after_bitand(degrees, lwe_array_1->degrees,
-                                lwe_array_2->degrees,
-                                lwe_array_1->num_radix_blocks);
-  } else if (mem_ptr->op == BITOP_TYPE::BITOR) {
-    update_degrees_after_bitor(degrees, lwe_array_1->degrees,
-                               lwe_array_2->degrees,
-                               lwe_array_1->num_radix_blocks);
-  } else if (mem_ptr->op == BITXOR) {
-    update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
-                                lwe_array_2->degrees,
-                                lwe_array_1->num_radix_blocks);
-  }

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
-      bsks, ksks, lut, lwe_array_out->num_radix_blocks,
-      lut->params.message_modulus);
-
-  memcpy(lwe_array_out->degrees, degrees,
-         lwe_array_out->num_radix_blocks * sizeof(uint64_t));
+      bsks, ksks, num_radix_blocks, lut, lut->params.message_modulus);
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -25,16 +25,19 @@ void scratch_cuda_integer_radix_cmux_kb_64(

 void cuda_cmux_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_condition,
-    CudaRadixCiphertextFFI const *lwe_array_true,
-    CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
+    void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_ciphertext_count) {

  host_integer_radix_cmux_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
-      lwe_condition, lwe_array_true, lwe_array_false,
-      (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_condition),
+      static_cast<const uint64_t *>(lwe_array_true),
+      static_cast<const uint64_t *>(lwe_array_false),
+      (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+
+      lwe_ciphertext_count);
 }

 void cleanup_cuda_integer_radix_cmux(void *const *streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -2,7 +2,6 @@
 #define CUDA_INTEGER_CMUX_CUH

 #include "integer.cuh"
-#include "radix_ciphertext.cuh"

 template <typename Torus>
 __host__ void zero_out_if(cudaStream_t const *streams,
@@ -12,25 +11,25 @@ __host__ void zero_out_if(cudaStream_t const *streams,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
                          int_radix_lut<Torus> *predicate, void *const *bsks,
                          Torus *const *ksks, uint32_t num_radix_blocks) {
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

  // We can't use integer_radix_apply_bivariate_lookup_table_kb since the
  // second operand is not an array
  auto tmp_lwe_array_input = mem_ptr->tmp;
-  host_pack_bivariate_blocks_with_single_block<Torus>(
+  pack_bivariate_blocks_with_single_block<Torus>(
      streams, gpu_indexes, gpu_count, tmp_lwe_array_input,
      predicate->lwe_indexes_in, lwe_array_input, lwe_condition,
      predicate->lwe_indexes_in, params.big_lwe_dimension,
      params.message_modulus, num_radix_blocks);

-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
      ksks, num_radix_blocks, predicate);
 }

 template <typename Torus>
-__host__ void legacy_host_integer_radix_cmux_kb(
+__host__ void host_integer_radix_cmux_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition,
    Torus const *lwe_array_true, Torus const *lwe_array_false,
@@ -40,88 +39,34 @@ __host__ void legacy_host_integer_radix_cmux_kb(
  auto params = mem_ptr->params;
  Torus lwe_size = params.big_lwe_dimension + 1;
  Torus radix_lwe_size = lwe_size * num_radix_blocks;
-  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in->ptr, lwe_array_true,
+  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in, lwe_array_true,
                               radix_lwe_size * sizeof(Torus), streams[0],
                               gpu_indexes[0]);
-  cuda_memcpy_async_gpu_to_gpu(
-      (Torus *)(mem_ptr->buffer_in->ptr) + radix_lwe_size, lwe_array_false,
-      radix_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
+  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in + radix_lwe_size,
+                               lwe_array_false, radix_lwe_size * sizeof(Torus),
+                               streams[0], gpu_indexes[0]);
  for (uint i = 0; i < 2 * num_radix_blocks; i++) {
-    cuda_memcpy_async_gpu_to_gpu(
-        (Torus *)(mem_ptr->condition_array->ptr) + i * lwe_size, lwe_condition,
-        lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
-  }
-  legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, (Torus *)(mem_ptr->buffer_out->ptr),
-      (Torus *)(mem_ptr->buffer_in->ptr),
-      (Torus *)(mem_ptr->condition_array->ptr), bsks, ksks,
-      2 * num_radix_blocks, mem_ptr->predicate_lut, params.message_modulus);
-
-  // If the condition was true, true_ct will have kept its value and false_ct
-  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
-  // have kept its value
-  auto mem_true = (Torus *)(mem_ptr->buffer_out->ptr);
-  auto ptr = (Torus *)mem_ptr->buffer_out->ptr;
-  auto mem_false = &ptr[radix_lwe_size];
-  auto added_cts = mem_true;
-  legacy_host_addition<Torus>(streams[0], gpu_indexes[0], added_cts, mem_true,
-                              mem_false, params.big_lwe_dimension,
-                              num_radix_blocks);
-
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
-      num_radix_blocks, mem_ptr->message_extract_lut);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_cmux_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_condition,
-    CudaRadixCiphertextFFI const *lwe_array_true,
-    CudaRadixCiphertextFFI const *lwe_array_false,
-    int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
-
-  if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-  if (lwe_array_out->num_radix_blocks != lwe_array_false->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-
-  auto num_radix_blocks = lwe_array_out->num_radix_blocks;
-  auto params = mem_ptr->params;
-  Torus lwe_size = params.big_lwe_dimension + 1;
-  copy_radix_ciphertext_slice_async<Torus>(
-      streams[0], gpu_indexes[0], mem_ptr->buffer_in, 0, num_radix_blocks,
-      lwe_array_true, 0, num_radix_blocks);
-  copy_radix_ciphertext_slice_async<Torus>(
-      streams[0], gpu_indexes[0], mem_ptr->buffer_in, num_radix_blocks,
-      2 * num_radix_blocks, lwe_array_false, 0, num_radix_blocks);
-  for (uint i = 0; i < 2 * num_radix_blocks; i++) {
-    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                             mem_ptr->condition_array, i, i + 1,
-                                             lwe_condition, 0, 1);
+    cuda_memcpy_async_gpu_to_gpu(mem_ptr->condition_array + i * lwe_size,
+                                 lwe_condition, lwe_size * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
  }
  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in,
-      mem_ptr->condition_array, bsks, ksks, mem_ptr->predicate_lut,
-      2 * num_radix_blocks, params.message_modulus);
+      mem_ptr->condition_array, bsks, ksks, 2 * num_radix_blocks,
+      mem_ptr->predicate_lut, params.message_modulus);

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
  // have kept its value
-  CudaRadixCiphertextFFI mem_true;
-  CudaRadixCiphertextFFI mem_false;
-  as_radix_ciphertext_slice<Torus>(&mem_true, mem_ptr->buffer_out, 0,
-                                   num_radix_blocks);
-  as_radix_ciphertext_slice<Torus>(&mem_false, mem_ptr->buffer_out,
-                                   num_radix_blocks, 2 * num_radix_blocks);
-
-  host_addition<Torus>(streams[0], gpu_indexes[0], &mem_true, &mem_true,
-                       &mem_false, num_radix_blocks);
+  auto mem_true = mem_ptr->buffer_out;
+  auto mem_false = &mem_ptr->buffer_out[radix_lwe_size];
+  auto added_cts = mem_true;
+  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts, mem_true,
+                       mem_false, params.big_lwe_dimension, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
-      mem_ptr->message_extract_lut, num_radix_blocks);
+      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
+      num_radix_blocks, mem_ptr->message_extract_lut);
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -38,7 +38,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
                                    uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -122,9 +122,7 @@ __host__ void are_all_comparisons_block_true(
        };
        generate_device_accumulator<Torus>(
            streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
-            is_max_value_lut->get_degree(1),
-            is_max_value_lut->get_max_degree(1), glwe_dimension,
-            polynomial_size, message_modulus, carry_modulus,
+            glwe_dimension, polynomial_size, message_modulus, carry_modulus,
            is_equal_to_num_blocks_lut_f);

        Torus *h_lut_indexes = (Torus *)malloc(num_chunks * sizeof(Torus));
@@ -148,12 +146,12 @@ __host__ void are_all_comparisons_block_true(
    // Applies the LUT
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
          ksks, 1, lut);
      return;
    } else {
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
          num_chunks, lut);
    }
@@ -219,12 +217,12 @@ __host__ void is_at_least_one_comparisons_block_true(
    // Applies the LUT
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
          ksks, 1, lut);
      return;
    } else {
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
          accumulator, bsks, ksks, num_chunks, lut);
    }
@@ -305,7 +303,7 @@ __host__ void host_compare_with_zero_equality(
    }
  }

-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
      zero_comparison);
  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
@@ -324,7 +322,7 @@ __host__ void host_integer_radix_equality_check_kb(

  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
-  legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
      bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
      eq_buffer->operator_lut->params.message_modulus);
@@ -371,14 +369,14 @@ __host__ void compare_radix_blocks_kb(

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
      num_radix_blocks, is_non_zero_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  legacy_host_integer_radix_add_scalar_one_inplace<Torus>(
+  host_integer_radix_add_scalar_one_inplace<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
      num_radix_blocks, message_modulus, carry_modulus);
 }
@@ -422,7 +420,7 @@ __host__ void tree_sign_reduction(
    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
                       partial_block_count, 4);

-    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
        partial_block_count >> 1, inner_tree_leaf);

@@ -462,13 +460,12 @@ __host__ void tree_sign_reduction(
    f = sign_handler_f;
  }
  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
-      last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
+      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
      polynomial_size, message_modulus, carry_modulus, f);
  last_lut->broadcast_lut(streams, gpu_indexes, 0);

  // Last leaf
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
      last_lut);
 }
@@ -514,7 +511,7 @@ __host__ void host_integer_radix_difference_check_kb(

    // Clean noise
    auto identity_lut = mem_ptr->identity_lut;
-    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
        2 * packed_num_radix_blocks, identity_lut);

@@ -552,11 +549,11 @@ __host__ void host_integer_radix_difference_check_kb(
          packed_left + packed_num_radix_blocks * big_lwe_size;
      Torus *last_right_block_before_sign_block =
          packed_right + packed_num_radix_blocks * big_lwe_size;
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
          identity_lut);
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
          1, identity_lut);
@@ -566,7 +563,7 @@ __host__ void host_integer_radix_difference_check_kb(
          last_left_block_before_sign_block, last_right_block_before_sign_block,
          mem_ptr, bsks, ksks, 1);
      // Compare the sign block separately
-      legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -579,7 +576,7 @@ __host__ void host_integer_radix_difference_check_kb(
          streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
          lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
      // Compare the sign block separately
-      legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -623,7 +620,7 @@ __host__ void host_integer_radix_maxmin_kb(
      ksks, total_num_radix_blocks);

  // Selector
-  legacy_host_integer_radix_cmux_kb<Torus>(
+  host_integer_radix_cmux_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out,
      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -50,39 +50,31 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
  if (array_in == array_out)
    PANIC("Cuda error: Input and output must be different");

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  auto compression_params = mem_ptr->compression_params;

  auto log_modulus = mem_ptr->storage_log_modulus;
-  printf("pack num_lwes: %u\n", num_lwes);
-  printf("pack log_modulus: %u\n", log_modulus);
  // [0..num_glwes-1) GLWEs
-  auto in_len = num_glwes * compression_params.glwe_dimension *
-                compression_params.polynomial_size + num_lwes;
-  printf("pack compression_params.glwe_dimension: %u\n", compression_params.glwe_dimension);
-  printf("pack compression_params.polynomial_size: %u\n", compression_params.polynomial_size);
-  printf("pack in_len: %u\n", in_len);
-
+  auto in_len = (compression_params.glwe_dimension + 1) *
+                compression_params.polynomial_size;
  auto number_bits_to_pack = in_len * log_modulus;
-  printf("pack number_bits_to_pack: %u\n", number_bits_to_pack);
-
  auto nbits = sizeof(Torus) * 8;
-  printf("pack nbits: %lu\n", nbits);
-
  // number_bits_to_pack.div_ceil(Scalar::BITS)
  auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
-  printf("pack out_len: %u\n", out_len);

  // Last GLWE
-  printf("pack num_glwes: %u\n", num_glwes);
+  number_bits_to_pack = in_len * log_modulus;
+  auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits;
+
+  auto num_coeffs = (num_glwes - 1) * out_len + last_out_len;

  int num_blocks = 0, num_threads = 0;
-  getNumBlocksAndThreads(out_len, 1024, num_blocks, num_threads);
+  getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads);

  dim3 grid(num_blocks);
  dim3 threads(num_threads);
  pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
-                                            out_len, in_len, out_len);
+                                            num_coeffs, in_len, out_len);
  check_cuda_error(cudaGetLastError());
 }

@@ -193,46 +185,32 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
  if (array_in == glwe_array_out)
    PANIC("Cuda error: Input and output must be different");

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  auto compression_params = mem_ptr->compression_params;

  auto log_modulus = mem_ptr->storage_log_modulus;

-  uint32_t body_count = mem_ptr->body_count;
-
-
-auto num_glwes = (body_count + compression_params.polynomial_size - 1) / compression_params.polynomial_size;
-printf("%u / %u\n", glwe_index, num_glwes);
-  printf("extract body_count: %u\n", body_count);
-
-  if (glwe_index == num_glwes-1)
-    body_count %= compression_params.polynomial_size;
-
+  uint32_t body_count =
+      std::min(mem_ptr->body_count, compression_params.polynomial_size);
  auto initial_out_len =
-      (compression_params.glwe_dimension+1) * compression_params.polynomial_size;
-  printf("extract compression_params.glwe_dimension: %u\n", compression_params.glwe_dimension);
-  printf("extract compression_params.polynomial_size: %u\n", compression_params.polynomial_size);
-  printf("extract initial_out_len: %u\n", initial_out_len);
-
-  auto number_bits_to_unpack = initial_out_len * log_modulus;
-  printf("extract log_modulus: %u\n", log_modulus);
-  printf("extract number_bits_to_unpack: %u\n", number_bits_to_unpack);
+      compression_params.glwe_dimension * compression_params.polynomial_size +
+      body_count;

+  auto compressed_glwe_accumulator_size =
+      (compression_params.glwe_dimension + 1) *
+      compression_params.polynomial_size;
+  auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
  auto nbits = sizeof(Torus) * 8;
-  printf("extract nbits: %lu\n", nbits);
-
+  // number_bits_to_unpack.div_ceil(Scalar::BITS)
  auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
-  printf("extract input_len: %u\n", input_len);

  // We assure the tail of the glwe is zeroed
  auto zeroed_slice = glwe_array_out + initial_out_len;
-  // cuda_memset_async(zeroed_slice, 0,
-  //                   (compression_params.polynomial_size - body_count) *
-  //                       sizeof(Torus),
-  //                   stream, gpu_index);
-  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) * compression_params.polynomial_size;
-  cuda_memset_async(glwe_array_out, 0, glwe_ciphertext_size * sizeof(Torus), stream, gpu_index);
+  cuda_memset_async(zeroed_slice, 0,
+                    (compression_params.polynomial_size - body_count) *
+                        sizeof(Torus),
+                    stream, gpu_index);
  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
  dim3 grid(num_blocks);
@@ -257,6 +235,10 @@ __host__ void host_integer_decompress(

  auto compression_params = h_mem_ptr->compression_params;
  auto lwe_per_glwe = compression_params.polynomial_size;
+  if (indexes_array_size > lwe_per_glwe)
+    PANIC("Cuda error: too many LWEs to decompress. The number of LWEs should "
+          "be smaller than "
+          "polynomial_size.")

  auto num_radix_blocks = h_mem_ptr->num_radix_blocks;
  if (num_radix_blocks != indexes_array_size)
@@ -287,7 +269,7 @@ __host__ void host_integer_decompress(
      glwe_vec.push_back(std::make_pair(i, extracted_glwe));
    } else {
      // Updates the index
-      ++glwe_vec.back().first;
+      glwe_vec.back().first++;
    }
  }
  // Sample extract all LWEs
@@ -297,7 +279,7 @@ __host__ void host_integer_decompress(
  uint32_t current_idx = 0;
  auto d_indexes_array_chunk = d_indexes_array;
  for (const auto &max_idx_and_glwe : glwe_vec) {
-    const uint32_t last_idx = max_idx_and_glwe.first;
+    uint32_t last_idx = max_idx_and_glwe.first;
    extracted_glwe = max_idx_and_glwe.second;

    auto num_lwes = last_idx + 1 - current_idx;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -285,7 +285,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      // Shift the mask so that we will only keep bits we should
      uint32_t shifted_mask = full_message_mask >> shift_amount;

-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
          interesting_divisor.last_block(), bsks, ksks, 1,
          mem_ptr->masking_luts_1[shifted_mask]);
@@ -314,7 +314,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      // the estimated degree of the output is < msg_modulus
      shifted_mask = shifted_mask & full_message_mask;

-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
          divisor_ms_blocks.first_block(), bsks, ksks, 1,
          mem_ptr->masking_luts_2[shifted_mask]);
@@ -339,7 +339,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      interesting_remainder1.insert(0, numerator_block_1.first_block(),
                                    streams[0], gpu_indexes[0]);

-      legacy_host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
+      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
          streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
          mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);

@@ -347,7 +347,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
                           interesting_remainder1.len - 1, streams[0],
                           gpu_indexes[0]);

-      legacy_host_radix_blocks_rotate_left<Torus>(
+      host_radix_blocks_rotate_left<Torus>(
          streams, gpu_indexes, gpu_count, interesting_remainder1.data,
          tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);

@@ -369,7 +369,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto left_shift_interesting_remainder2 = [&](cudaStream_t const *streams,
                                                 uint32_t const *gpu_indexes,
                                                 uint32_t gpu_count) {
-      legacy_host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
+      host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
          streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
          mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
    }; // left_shift_interesting_remainder2
@@ -402,7 +402,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    // but in that position, interesting_remainder2 always has a 0
    auto &merged_interesting_remainder = interesting_remainder1;

-    legacy_host_addition<Torus>(
+    host_addition<Torus>(
        streams[0], gpu_indexes[0], merged_interesting_remainder.data,
        merged_interesting_remainder.data, interesting_remainder2.data,
        radix_params.big_lwe_dimension, merged_interesting_remainder.len);
@@ -437,7 +437,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      mem_ptr->overflow_sub_mem->update_lut_indexes(
          streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
          merged_interesting_remainder.len);
-      legacy_host_integer_overflowing_sub<uint64_t>(
+      host_integer_overflowing_sub<uint64_t>(
          streams, gpu_indexes, gpu_count, new_remainder.data,
          (uint64_t *)merged_interesting_remainder.data,
          interesting_divisor.data, subtraction_overflowed.data,
@@ -481,7 +481,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto create_clean_version_of_merged_remainder =
        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
            uint32_t gpu_count) {
-          legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          integer_radix_apply_univariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data, bsks, ksks,
@@ -507,10 +507,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

-    legacy_host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
-                                subtraction_overflowed.data,
-                                at_least_one_upper_block_is_non_zero.data,
-                                radix_params.big_lwe_dimension, 1);
+    host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
+                         subtraction_overflowed.data,
+                         at_least_one_upper_block_is_non_zero.data,
+                         radix_params.big_lwe_dimension, 1);

    int factor = (i) ? 3 : 2;
    int factor_lut_id = factor - 2;
@@ -521,7 +521,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto conditionally_zero_out_merged_interesting_remainder =
        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
            uint32_t gpu_count) {
-          legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data,
@@ -534,7 +534,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto conditionally_zero_out_merged_new_remainder =
        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
            uint32_t gpu_count) {
-          legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count, new_remainder.data,
              new_remainder.data, overflow_sum_radix.data, bsks, ksks,
              new_remainder.len,
@@ -544,7 +544,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto set_quotient_bit = [&](cudaStream_t const *streams,
                                uint32_t const *gpu_indexes,
                                uint32_t gpu_count) {
-      legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, did_not_overflow.data,
          subtraction_overflowed.data,
          at_least_one_upper_block_is_non_zero.data, bsks, ksks, 1,
@@ -552,7 +552,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
          mem_ptr->merge_overflow_flags_luts[pos_in_block]
              ->params.message_modulus);

-      legacy_host_addition<Torus>(
+      host_addition<Torus>(
          streams[0], gpu_indexes[0], &quotient[block_of_bit * big_lwe_size],
          &quotient[block_of_bit * big_lwe_size], did_not_overflow.data,
          radix_params.big_lwe_dimension, 1);
@@ -588,17 +588,17 @@ __host__ void host_unsigned_integer_div_rem_kb(

  // Clean the quotient and remainder
  // as even though they have no carries, they are not at nominal noise level
-  legacy_host_addition<Torus>(streams[0], gpu_indexes[0], remainder,
-                              remainder1.data, remainder2.data,
-                              radix_params.big_lwe_dimension, remainder1.len);
+  host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1.data,
+                       remainder2.data, radix_params.big_lwe_dimension,
+                       remainder1.len);

  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
      bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
      ksks, num_blocks, mem_ptr->message_extract_lut_2);
  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
@@ -636,14 +636,12 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }

-    legacy_host_integer_abs_kb_async<Torus>(
-        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
-        positive_numerator.data, bsks, ksks, int_mem_ptr->abs_mem_1, true,
-        num_blocks);
-    legacy_host_integer_abs_kb_async<Torus>(
-        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
-        positive_divisor.data, bsks, ksks, int_mem_ptr->abs_mem_2, true,
-        num_blocks);
+    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
+                               gpu_count, positive_numerator.data, bsks, ksks,
+                               int_mem_ptr->abs_mem_1, true, num_blocks);
+    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
+                               gpu_count, positive_divisor.data, bsks, ksks,
+                               int_mem_ptr->abs_mem_2, true, num_blocks);
    for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -654,7 +652,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
        positive_numerator.data, positive_divisor.data, bsks, ksks,
        int_mem_ptr->unsigned_mem, num_blocks);

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
        int_mem_ptr->sign_bits_are_different,
        &numerator[big_lwe_size * (num_blocks - 1)],
@@ -667,36 +665,36 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
    }

-    legacy_host_integer_radix_negation(
+    host_integer_radix_negation(
        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
        int_mem_ptr->negated_quotient, quotient, radix_params.big_lwe_dimension,
        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);

    uint32_t requested_flag = outputFlag::FLAG_NONE;
    uint32_t uses_carry = 0;
-    legacy_host_propagate_single_carry<Torus>(
+    host_propagate_single_carry<Torus>(
        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
        int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1,
        bsks, ksks, num_blocks, requested_flag, uses_carry);

-    legacy_host_integer_radix_negation(
-        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
-        int_mem_ptr->negated_remainder, remainder,
-        radix_params.big_lwe_dimension, num_blocks,
-        radix_params.message_modulus, radix_params.carry_modulus);
+    host_integer_radix_negation(int_mem_ptr->sub_streams_2, gpu_indexes,
+                                gpu_count, int_mem_ptr->negated_remainder,
+                                remainder, radix_params.big_lwe_dimension,
+                                num_blocks, radix_params.message_modulus,
+                                radix_params.carry_modulus);

-    legacy_host_propagate_single_carry<Torus>(
+    host_propagate_single_carry<Torus>(
        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
        int_mem_ptr->negated_remainder, nullptr, nullptr,
        int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks, requested_flag,
        uses_carry);

-    legacy_host_integer_radix_cmux_kb<Torus>(
+    host_integer_radix_cmux_kb<Torus>(
        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient,
        int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
        quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks, num_blocks);

-    legacy_host_integer_radix_cmux_kb<Torus>(
+    host_integer_radix_cmux_kb<Torus>(
        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder,
        &numerator[big_lwe_size * (num_blocks - 1)],
        int_mem_ptr->negated_remainder, remainder,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -106,42 +106,49 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(

 void cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI *carry_out,
-    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
+    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks,
+    uint32_t requested_flag, uint32_t uses_carry) {

  host_propagate_single_carry<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, carry_out,
-      carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), requested_flag, uses_carry);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
+      static_cast<const uint64_t *>(carry_in),
+      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks, requested_flag, uses_carry);
 }

 void cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
-    CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t requested_flag, uint32_t uses_carry) {
+    void *lhs_array, const void *rhs_array, void *carry_out,
+    const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry) {

  host_add_and_propagate_single_carry<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
-      carry_out, carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), requested_flag, uses_carry);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lhs_array),
+      static_cast<const uint64_t *>(rhs_array),
+      static_cast<uint64_t *>(carry_out),
+      static_cast<const uint64_t *>(carry_in),
+      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks, requested_flag, uses_carry);
 }

 void cuda_integer_overflowing_sub_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
-    CudaRadixCiphertextFFI *overflow_block,
-    const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks, uint32_t compute_overflow,
+    void *lhs_array, const void *rhs_array, void *overflow_block,
+    const void *input_borrow, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks, uint32_t compute_overflow,
    uint32_t uses_input_borrow) {

  host_integer_overflowing_sub<uint64_t>(
-      (cudaStream_t const *)streams, gpu_indexes, gpu_count, lhs_array,
-      lhs_array, rhs_array, overflow_block, input_borrow,
+      (cudaStream_t const *)streams, gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lhs_array), static_cast<uint64_t *>(lhs_array),
+      static_cast<const uint64_t *>(rhs_array),
+      static_cast<uint64_t *>(overflow_block),
+      static_cast<const uint64_t *>(input_borrow),
      (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
-      compute_overflow, uses_input_borrow);
+      num_blocks, compute_overflow, uses_input_borrow);
 }

 void cleanup_cuda_propagate_single_carry(void *const *streams,
@@ -177,7 +184,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -188,7 +195,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      lut_degree, allocate_gpu_memory);
+      allocate_gpu_memory);
 }

 void scratch_cuda_apply_many_univariate_lut_kb_64(
@@ -198,7 +205,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory) {
+    uint32_t num_many_lut, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -209,19 +216,22 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      num_many_lut, lut_degree, allocate_gpu_memory);
+      num_many_lut, allocate_gpu_memory);
 }

-void cuda_apply_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks) {
+void cuda_apply_univariate_lut_kb_64(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, void *output_radix_lwe,
+                                     void const *input_radix_lwe,
+                                     int8_t *mem_ptr, void *const *ksks,
+                                     void *const *bsks, uint32_t num_blocks) {

  host_apply_univariate_lut_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
-      input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
-      bsks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<const uint64_t *>(input_radix_lwe),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
+      num_blocks);
 }

 void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
@@ -234,15 +244,16 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,

 void cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_many_lut,
-    uint32_t lut_stride) {
+    void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks,
+    uint32_t num_many_lut, uint32_t lut_stride) {

  host_apply_many_univariate_lut_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
-      input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
-      bsks, num_many_lut, lut_stride);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<const uint64_t *>(input_radix_lwe),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
+      num_many_lut, lut_stride);
 }

 void scratch_cuda_apply_bivariate_lut_kb_64(
@@ -252,7 +263,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -263,21 +274,22 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      lut_degree, allocate_gpu_memory);
+      allocate_gpu_memory);
 }

 void cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe_1,
-    CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
-    uint32_t shift) {
+    void *output_radix_lwe, void const *input_radix_lwe_1,
+    void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
+    void *const *bsks, uint32_t num_blocks, uint32_t shift) {

  host_apply_bivariate_lut_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
-      input_radix_lwe_1, input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
-      (uint64_t **)(ksks), bsks, num_radix_blocks, shift);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<const uint64_t *>(input_radix_lwe_1),
+      static_cast<const uint64_t *>(input_radix_lwe_2),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
+      shift);
 }

 void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
@@ -295,7 +307,7 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -306,21 +318,22 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      lut_degree, allocate_gpu_memory);
+      allocate_gpu_memory);
 }

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) {
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift) {

  int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;

  host_compute_prefix_sum_hillis_steele<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
-      generates_or_propagates, params, (int_radix_lut<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), num_radix_blocks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<uint64_t *>(generates_or_propagates), params,
+      (int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks);
 }

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
@@ -332,9 +345,11 @@ void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(

 void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
                                            uint32_t const *gpu_indexes,
-                                            uint32_t gpu_count,
-                                            CudaRadixCiphertextFFI *lwe_array) {
+                                            uint32_t gpu_count, void *lwe_array,
+                                            uint32_t num_blocks,
+                                            uint32_t lwe_size) {

-  host_radix_blocks_reverse_inplace<uint64_t>((cudaStream_t *)(streams),
-                                              gpu_indexes, lwe_array);
+  host_radix_blocks_reverse_inplace<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes,
+      static_cast<uint64_t *>(lwe_array), num_blocks, lwe_size);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -228,9 +228,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
                                 streams[0], gpu_indexes[0]);
  }
  if (num_radix_in_vec == 2) {
-    legacy_host_addition<Torus>(
-        streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-        &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                         &old_blocks[num_blocks * big_lwe_size],
+                         big_lwe_dimension, num_blocks);
    return;
  }

@@ -280,13 +280,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

  // generate accumulators
  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], message_acc,
-      luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
-      glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-      lut_f_message);
+      streams[0], gpu_indexes[0], message_acc, glwe_dimension, polynomial_size,
+      message_modulus, carry_modulus, lut_f_message);
  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
-      luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
+      streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
      message_modulus, carry_modulus, lut_f_carry);
  luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

@@ -297,7 +294,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
      ch_amount++;
    dim3 add_grid(ch_amount, num_blocks, 1);

-    cuda_set_device(gpu_indexes[0]);
+    cudaSetDevice(gpu_indexes[0]);
    tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
        new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);

@@ -448,9 +445,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  luts_message_carry->release(streams, gpu_indexes, gpu_count);
  delete (luts_message_carry);

-  legacy_host_addition<Torus>(
-      streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-      &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                       &old_blocks[num_blocks * big_lwe_size],
+                       big_lwe_dimension, num_blocks);
 }

 template <typename Torus, class params>
@@ -541,13 +538,13 @@ __host__ void host_integer_mult_radix_kb(
  dim3 grid(lsb_vector_block_count, 1, 1);
  dim3 thds(params::degree / params::opt, 1, 1);

-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);
  all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
      radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
      vector_lsb_rhs, vector_msb_rhs, num_blocks);
  check_cuda_error(cudaGetLastError());

-  legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res,
      vector_result_sb, bsks, ksks, total_block_count, luts_array,
      luts_array->params.message_modulus);
@@ -556,7 +553,7 @@ __host__ void host_integer_mult_radix_kb(
  vector_result_msb = &block_mul_res[lsb_vector_block_count *
                                     (polynomial_size * glwe_dimension + 1)];

-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);
  fill_radix_from_lsb_msb<Torus, params>
      <<<num_blocks * num_blocks, params::degree / params::opt, 0,
         streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
@@ -587,7 +584,7 @@ __host__ void host_integer_mult_radix_kb(
  auto scp_mem_ptr = mem_ptr->sc_prop_mem;
  uint32_t requested_flag = outputFlag::FLAG_NONE;
  uint32_t uses_carry = 0;
-  legacy_host_propagate_single_carry<Torus>(
+  host_propagate_single_carry<Torus>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
      scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -2,11 +2,13 @@

 void cuda_negate_integer_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
+    void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus) {

-  host_integer_radix_negation<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
-                                        gpu_count, lwe_array_out, lwe_array_in,
-                                        message_modulus, carry_modulus);
+  host_integer_radix_negation<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_in), lwe_dimension,
+      lwe_ciphertext_count, message_modulus, carry_modulus);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -54,69 +54,12 @@ device_integer_radix_negation(Torus *output, Torus const *input,
 }

 template <typename Torus>
-__host__ void
-host_integer_radix_negation(cudaStream_t const *streams,
-                            uint32_t const *gpu_indexes, uint32_t gpu_count,
-                            CudaRadixCiphertextFFI *lwe_array_out,
-                            CudaRadixCiphertextFFI const *lwe_array_in,
-                            uint64_t message_modulus, uint64_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
-
-  if (lwe_array_out->num_radix_blocks != lwe_array_in->num_radix_blocks)
-    PANIC("Cuda error: lwe_array_in and lwe_array_out num radix blocks must be "
-          "the same")
-
-  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
-    PANIC("Cuda error: lwe_array_in and lwe_array_out lwe_dimension must be "
-          "the same")
-
-  auto num_radix_blocks = lwe_array_out->num_radix_blocks;
-  auto lwe_dimension = lwe_array_out->lwe_dimension;
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_integer_radix_negation<Torus><<<grid, thds, 0, streams[0]>>>(
-      static_cast<Torus *>(lwe_array_out->ptr),
-      static_cast<Torus *>(lwe_array_in->ptr), num_radix_blocks, lwe_dimension,
-      message_modulus, delta);
-  check_cuda_error(cudaGetLastError());
-
-  uint8_t zb = 0;
-  for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
-    auto input_degree = lwe_array_in->degrees[i];
-
-    if (zb != 0) {
-      input_degree += static_cast<uint64_t>(zb);
-    }
-    Torus z =
-        std::max(static_cast<Torus>(1),
-                 static_cast<Torus>(ceil(input_degree / message_modulus))) *
-        message_modulus;
-
-    lwe_array_out->degrees[i] = z - static_cast<uint64_t>(zb);
-    lwe_array_out->noise_levels[i] = lwe_array_in->noise_levels[i];
-    zb = z / message_modulus;
-  }
-}
-template <typename Torus>
-__host__ void legacy_host_integer_radix_negation(
+__host__ void host_integer_radix_negation(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *output, Torus const *input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint64_t message_modulus, uint64_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);

  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
@@ -172,7 +115,7 @@ __host__ void host_integer_overflowing_sub_kb(

 */
 template <typename Torus>
-__host__ void legacy_host_integer_overflowing_sub(
+__host__ void host_integer_overflowing_sub(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_out_array, Torus *lhs_array,
    const Torus *rhs_array, Torus *overflow_block, const Torus *input_borrow,
@@ -191,13 +134,13 @@ __host__ void legacy_host_integer_overflowing_sub(
  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;

  auto stream = (cudaStream_t *)streams;
-  legacy_host_unchecked_sub_with_correcting_term<Torus>(
+  host_unchecked_sub_with_correcting_term<Torus>(
      stream[0], gpu_indexes[0], static_cast<Torus *>(lwe_out_array),
      static_cast<Torus *>(lhs_array), static_cast<const Torus *>(rhs_array),
      radix_params.big_lwe_dimension, num_blocks, radix_params.message_modulus,
      radix_params.carry_modulus, radix_params.message_modulus - 1);

-  legacy_host_single_borrow_propagate<Torus>(
+  host_single_borrow_propagate<Torus>(
      streams, gpu_indexes, gpu_count, static_cast<Torus *>(lwe_out_array),
      static_cast<Torus *>(overflow_block),
      static_cast<const Torus *>(input_borrow),
@@ -205,47 +148,4 @@ __host__ void legacy_host_integer_overflowing_sub(
      num_blocks, num_groups, compute_overflow, uses_input_borrow);
 }

-template <typename Torus>
-__host__ void host_integer_overflowing_sub(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
-    CudaRadixCiphertextFFI *input_left,
-    const CudaRadixCiphertextFFI *input_right,
-    CudaRadixCiphertextFFI *overflow_block,
-    const CudaRadixCiphertextFFI *input_borrow,
-    int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
-    Torus *const *ksks, uint32_t compute_overflow, uint32_t uses_input_borrow) {
-
-  if (output->num_radix_blocks != input_left->num_radix_blocks ||
-      output->num_radix_blocks != input_right->num_radix_blocks)
-    PANIC("Cuda error: lwe_array_in and output num radix blocks must be "
-          "the same")
-
-  if (output->lwe_dimension != input_left->lwe_dimension ||
-      output->lwe_dimension != input_right->lwe_dimension)
-    PANIC("Cuda error: lwe_array_in and output lwe_dimension must be "
-          "the same")
-
-  auto num_blocks = output->num_radix_blocks;
-  auto radix_params = mem_ptr->params;
-
-  // We need to recalculate the num_groups, because on the division the number
-  // of num_blocks changes
-  uint32_t block_modulus =
-      radix_params.message_modulus * radix_params.carry_modulus;
-  uint32_t num_bits_in_block = log2_int(block_modulus);
-  uint32_t grouping_size = num_bits_in_block;
-  uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
-
-  auto stream = (cudaStream_t *)streams;
-  host_unchecked_sub_with_correcting_term<Torus>(
-      stream[0], gpu_indexes[0], output, input_left, input_right, num_blocks,
-      radix_params.message_modulus, radix_params.carry_modulus);
-
-  host_single_borrow_propagate<Torus>(
-      streams, gpu_indexes, gpu_count, output, overflow_block, input_borrow,
-      (int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
-      num_groups, compute_overflow, uses_input_borrow);
-}
-
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu
@@ -1,10 +0,0 @@
-#include "radix_ciphertext.cuh"
-
-void release_radix_ciphertext(cudaStream_t const stream,
-                              uint32_t const gpu_index,
-                              CudaRadixCiphertextFFI *data) {
-  cuda_drop_async(data->ptr, stream, gpu_index);
-  free(data->degrees);
-  free(data->noise_levels);
-  cuda_synchronize_stream(stream, gpu_index);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -1,142 +0,0 @@
-#ifndef CUDA_INTEGER_RADIX_CIPHERTEXT_CUH
-#define CUDA_INTEGER_RADIX_CIPHERTEXT_CUH
-
-#include "device.h"
-#include "integer/integer.h"
-
-template <typename Torus>
-void create_zero_radix_ciphertext_async(cudaStream_t const stream,
-                                        uint32_t const gpu_index,
-                                        CudaRadixCiphertextFFI *radix,
-                                        const uint32_t num_radix_blocks,
-                                        const uint32_t lwe_dimension) {
-  radix->lwe_dimension = lwe_dimension;
-  radix->num_radix_blocks = num_radix_blocks;
-  uint32_t size = (lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
-  radix->ptr = (void *)cuda_malloc_async(size, stream, gpu_index);
-  cuda_memset_async(radix->ptr, 0, size, stream, gpu_index);
-
-  radix->degrees = (uint64_t *)(calloc(num_radix_blocks, sizeof(uint64_t)));
-  radix->noise_levels =
-      (uint64_t *)(calloc(num_radix_blocks, sizeof(uint64_t)));
-  if (radix->degrees == NULL || radix->noise_levels == NULL) {
-    PANIC("Cuda error: degrees / noise levels not allocated correctly")
-  }
-}
-
-// end_input_lwe_index is exclusive
-template <typename Torus>
-void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix,
-                               const CudaRadixCiphertextFFI *input_radix,
-                               const uint32_t start_input_lwe_index,
-                               const uint32_t end_input_lwe_index) {
-  if (input_radix->num_radix_blocks <
-      end_input_lwe_index - start_input_lwe_index)
-    PANIC("Cuda error: input radix should have more blocks than the specified "
-          "range")
-  if (start_input_lwe_index >= end_input_lwe_index)
-    PANIC("Cuda error: slice range should be non negative")
-
-  auto lwe_size = input_radix->lwe_dimension + 1;
-  output_radix->num_radix_blocks = end_input_lwe_index - start_input_lwe_index;
-  output_radix->lwe_dimension = input_radix->lwe_dimension;
-  Torus *in_ptr = (Torus *)input_radix->ptr;
-  output_radix->ptr = (void *)(in_ptr + start_input_lwe_index * lwe_size);
-  output_radix->degrees = input_radix->degrees + start_input_lwe_index;
-  output_radix->noise_levels =
-      input_radix->noise_levels + start_input_lwe_index;
-}
-
-// end_lwe_index are exclusive
-template <typename Torus>
-void copy_radix_ciphertext_slice_async(
-    cudaStream_t const stream, uint32_t const gpu_index,
-    CudaRadixCiphertextFFI *output_radix, const uint32_t output_start_lwe_index,
-    const uint32_t output_end_lwe_index,
-    const CudaRadixCiphertextFFI *input_radix,
-    const uint32_t input_start_lwe_index, const uint32_t input_end_lwe_index) {
-  if (output_radix->lwe_dimension != input_radix->lwe_dimension)
-    PANIC("Cuda error: input lwe dimension should be equal to output lwe "
-          "dimension")
-  if (output_end_lwe_index - output_start_lwe_index !=
-      input_end_lwe_index - input_start_lwe_index)
-    PANIC("Cuda error: output and input ranges should have the same size")
-  if (output_end_lwe_index - output_start_lwe_index >
-      output_radix->num_radix_blocks)
-    PANIC("Cuda error: output range should be lower or equal to output num "
-          "blocks")
-  if (input_end_lwe_index - input_start_lwe_index >
-      input_radix->num_radix_blocks)
-    PANIC(
-        "Cuda error: input range should be lower or equal to input num blocks")
-  if (output_end_lwe_index - output_start_lwe_index <= 0)
-    PANIC("Cuda error: output range should be greater than zero")
-  if (input_end_lwe_index - input_start_lwe_index <= 0)
-    PANIC("Cuda error: input range should be greater than zero")
-  if (output_end_lwe_index <= output_start_lwe_index)
-    PANIC("Cuda error: output end index should be greater or equal to output "
-          "start index")
-  if (input_end_lwe_index <= input_start_lwe_index)
-    PANIC("Cuda error: input end index should be greater or equal to input "
-          "start index")
-  if (output_start_lwe_index > output_radix->num_radix_blocks)
-    PANIC("Cuda error: output start index should be smaller than the number "
-          "of blocks")
-  if (input_start_lwe_index > input_radix->num_radix_blocks)
-    PANIC("Cuda error: input start index should be smaller than the number "
-          "of blocks")
-
-  auto lwe_size = input_radix->lwe_dimension + 1;
-  Torus *out_ptr = (Torus *)output_radix->ptr;
-  out_ptr = &out_ptr[output_start_lwe_index * lwe_size];
-  Torus *in_ptr = (Torus *)input_radix->ptr;
-  in_ptr = &in_ptr[input_start_lwe_index * lwe_size];
-  auto num_blocks = input_end_lwe_index - input_start_lwe_index;
-
-  cuda_memcpy_async_gpu_to_gpu(out_ptr, in_ptr,
-                               num_blocks * lwe_size * sizeof(Torus), stream,
-                               gpu_index);
-  for (uint i = 0; i < num_blocks; i++) {
-    output_radix->degrees[i + output_start_lwe_index] =
-        input_radix->degrees[i + input_start_lwe_index];
-    output_radix->noise_levels[i + output_start_lwe_index] =
-        input_radix->noise_levels[i + input_start_lwe_index];
-  }
-}
-
-template <typename Torus>
-void copy_radix_ciphertext_async(cudaStream_t const stream,
-                                 uint32_t const gpu_index,
-                                 CudaRadixCiphertextFFI *output_radix,
-                                 const CudaRadixCiphertextFFI *input_radix) {
-  copy_radix_ciphertext_slice_async<Torus>(
-      stream, gpu_index, output_radix, 0, output_radix->num_radix_blocks,
-      input_radix, 0, input_radix->num_radix_blocks);
-}
-
-// end_lwe_index is exclusive
-template <typename Torus>
-void set_zero_radix_ciphertext_slice_async(cudaStream_t const stream,
-                                           uint32_t const gpu_index,
-                                           CudaRadixCiphertextFFI *radix,
-                                           const uint32_t start_lwe_index,
-                                           const uint32_t end_lwe_index) {
-  if (radix->num_radix_blocks < end_lwe_index - start_lwe_index)
-    PANIC("Cuda error: input radix should have more blocks than the specified "
-          "range")
-  if (start_lwe_index >= end_lwe_index)
-    PANIC("Cuda error: slice range should be non negative")
-
-  auto lwe_size = radix->lwe_dimension + 1;
-  auto num_blocks_to_set = end_lwe_index - start_lwe_index;
-  auto lwe_array_out_block = (Torus *)radix->ptr + start_lwe_index * lwe_size;
-  cuda_memset_async(lwe_array_out_block, 0,
-                    num_blocks_to_set * lwe_size * sizeof(Torus), stream,
-                    gpu_index);
-  memset(&radix->degrees[start_lwe_index], 0,
-         num_blocks_to_set * sizeof(uint64_t));
-  memset(&radix->noise_levels[start_lwe_index], 0,
-         num_blocks_to_set * sizeof(uint64_t));
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
@@ -2,11 +2,13 @@

 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
-    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
+    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus) {

  host_integer_radix_scalar_addition_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-      static_cast<const uint64_t *>(scalar_input), num_scalars, message_modulus,
-      carry_modulus);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array),
+      static_cast<const uint64_t *>(scalar_input), lwe_dimension,
+      lwe_ciphertext_count, message_modulus, carry_modulus);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -24,12 +24,12 @@ __global__ void device_integer_radix_scalar_addition_inplace(
 }

 template <typename Torus>
-__host__ void legacy_host_integer_radix_scalar_addition_inplace(
+__host__ void host_integer_radix_scalar_addition_inplace(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
@@ -49,42 +49,6 @@ __host__ void legacy_host_integer_radix_scalar_addition_inplace(
                                      delta);
  check_cuda_error(cudaGetLastError());
 }
-template <typename Torus>
-__host__ void host_integer_radix_scalar_addition_inplace(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
-    Torus const *scalar_input, uint32_t num_scalars, uint32_t message_modulus,
-    uint32_t carry_modulus) {
-  if (lwe_array->num_radix_blocks < num_scalars)
-    PANIC("Cuda error: num scalars should be smaller or equal to input num "
-          "radix blocks")
-  cuda_set_device(gpu_indexes[0]);
-
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = num_scalars;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_integer_radix_scalar_addition_inplace<Torus>
-      <<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr, scalar_input,
-                                      num_scalars, lwe_array->lwe_dimension,
-                                      delta);
-  check_cuda_error(cudaGetLastError());
-  Torus scalar_input_cpu[num_scalars];
-  cuda_memcpy_async_to_cpu(&scalar_input_cpu, scalar_input,
-                           num_scalars * sizeof(Torus), streams[0],
-                           gpu_indexes[0]);
-  for (uint i = 0; i < num_scalars; i++) {
-    lwe_array->degrees[i] = lwe_array->degrees[i] + scalar_input_cpu[i];
-  }
-}

 template <typename Torus>
 __global__ void device_integer_radix_add_scalar_one_inplace(
@@ -98,42 +62,17 @@ __global__ void device_integer_radix_add_scalar_one_inplace(
  }
 }

-template <typename Torus>
-__host__ void legacy_host_integer_radix_add_scalar_one_inplace(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *lwe_array, uint32_t lwe_dimension,
-    uint32_t num_radix_blocks, uint32_t message_modulus,
-    uint32_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
-
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = num_radix_blocks;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_integer_radix_add_scalar_one_inplace<Torus>
-      <<<grid, thds, 0, streams[0]>>>(lwe_array, num_radix_blocks,
-                                      lwe_dimension, delta);
-  check_cuda_error(cudaGetLastError());
-}
-
 template <typename Torus>
 __host__ void host_integer_radix_add_scalar_one_inplace(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
-    uint32_t message_modulus, uint32_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
+    uint32_t gpu_count, Torus *lwe_array, uint32_t lwe_dimension,
+    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus) {
+  cudaSetDevice(gpu_indexes[0]);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
-  int num_entries = lwe_array->num_radix_blocks;
+  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
  dim3 grid(num_blocks, 1, 1);
  dim3 thds(num_threads, 1, 1);
@@ -144,13 +83,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  device_integer_radix_add_scalar_one_inplace<Torus>
-      <<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr,
-                                      lwe_array->num_radix_blocks,
-                                      lwe_array->lwe_dimension, delta);
+      <<<grid, thds, 0, streams[0]>>>(lwe_array, input_lwe_ciphertext_count,
+                                      lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
-  for (uint i = 0; i < lwe_array->num_radix_blocks; i++) {
-    lwe_array->degrees[i] = lwe_array->degrees[i] + 1;
-  }
 }

 template <typename Torus>
@@ -173,7 +108,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
    uint32_t gpu_count, Torus *lwe_array, Torus *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -2,58 +2,15 @@

 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
+    void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks) {
+    void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {

  host_integer_radix_scalar_bitop_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
-      lwe_array_input, static_cast<const uint64_t *>(clear_blocks),
-      num_clear_blocks, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
-}
-
-void update_degrees_after_scalar_bitand(uint64_t *output_degrees,
-                                        uint64_t *clear_degrees,
-                                        uint64_t *input_degrees,
-                                        uint32_t num_clear_blocks) {
-  for (uint i = 0; i < num_clear_blocks; i++) {
-    output_degrees[i] = std::min(clear_degrees[i], input_degrees[i]);
-  }
-}
-void update_degrees_after_scalar_bitor(uint64_t *output_degrees,
-                                       uint64_t *clear_degrees,
-                                       uint64_t *input_degrees,
-                                       uint32_t num_clear_blocks) {
-  for (uint i = 0; i < num_clear_blocks; i++) {
-    auto max = std::max(clear_degrees[i], input_degrees[i]);
-    auto min = std::min(clear_degrees[i], input_degrees[i]);
-    auto result = max;
-
-    for (uint j = 0; j < min + 1; j++) {
-      if (max | j > result) {
-        result = max | j;
-      }
-    }
-    output_degrees[i] = result;
-  }
-}
-void update_degrees_after_scalar_bitxor(uint64_t *output_degrees,
-                                        uint64_t *clear_degrees,
-                                        uint64_t *input_degrees,
-                                        uint32_t num_clear_blocks) {
-  for (uint i = 0; i < num_clear_blocks; i++) {
-    auto max = std::max(clear_degrees[i], input_degrees[i]);
-    auto min = std::min(clear_degrees[i], input_degrees[i]);
-    auto result = max;
-
-    // Try every possibility to find the worst case
-    for (uint j = 0; j < min + 1; j++) {
-      if (max ^ j > result) {
-        result = max ^ j;
-      }
-    }
-    output_degrees[i] = result;
-  }
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_input),
+      static_cast<const uint64_t *>(clear_blocks), num_clear_blocks,
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      lwe_ciphertext_count, op);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -7,60 +7,45 @@
 template <typename Torus>
 __host__ void host_integer_radix_scalar_bitop_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
-    CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
-    uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks) {
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_input,
+    Torus const *clear_blocks, uint32_t num_clear_blocks,
+    int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
+    uint32_t num_radix_blocks, BITOP_TYPE op) {

-  if (output->num_radix_blocks != input->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be equal")
-  if (output->lwe_dimension != input->lwe_dimension)
-    PANIC("Cuda error: input and output num radix blocks must be equal")
  auto lut = mem_ptr->lut;
-  auto op = mem_ptr->op;
-  auto num_radix_blocks = output->num_radix_blocks;
+  auto params = lut->params;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+
+  uint32_t lwe_size = big_lwe_dimension + 1;

  if (num_clear_blocks == 0) {
    if (op == SCALAR_BITAND) {
-      set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                                   output, 0, num_radix_blocks);
+      cuda_memset_async(lwe_array_out, 0,
+                        num_radix_blocks * lwe_size * sizeof(Torus), streams[0],
+                        gpu_indexes[0]);
    } else {
-      if (input != output)
-        copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
-                                           input);
+      cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
+                                   num_radix_blocks * lwe_size * sizeof(Torus),
+                                   streams[0], gpu_indexes[0]);
    }
  } else {
    // We have all possible LUTs pre-computed and we use the decomposed scalar
    // as index to recover the right one
-    uint64_t degrees[num_clear_blocks];
-    uint64_t clear_degrees[num_clear_blocks];
-    cuda_memcpy_async_to_cpu(&clear_degrees, clear_blocks,
-                             num_clear_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
-    if (mem_ptr->op == BITOP_TYPE::SCALAR_BITAND) {
-      update_degrees_after_scalar_bitand(degrees, clear_degrees, input->degrees,
-                                         num_clear_blocks);
-    } else if (mem_ptr->op == BITOP_TYPE::SCALAR_BITOR) {
-      update_degrees_after_scalar_bitor(degrees, clear_degrees, input->degrees,
-                                        num_clear_blocks);
-    } else if (mem_ptr->op == SCALAR_BITXOR) {
-      update_degrees_after_scalar_bitxor(degrees, clear_degrees, input->degrees,
-                                         num_clear_blocks);
-    }
    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
                                 num_clear_blocks * sizeof(Torus), streams[0],
                                 gpu_indexes[0]);
    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, output, input, bsks, ksks, lut,
-        num_clear_blocks);
-    memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
+        ksks, num_clear_blocks, lut);

    if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
-      set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                                   output, num_clear_blocks,
-                                                   num_radix_blocks);
+      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
+      cuda_memset_async(lwe_array_out_block, 0,
+                        (num_radix_blocks - num_clear_blocks) * lwe_size *
+                            sizeof(Torus),
+                        streams[0], gpu_indexes[0]);
    }
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -43,14 +43,14 @@ __host__ void scalar_compare_radix_blocks_kb(

  // Apply LUT to compare to 0
  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
      ksks, num_radix_blocks, sign_lut);

  // Add one
  // Here Lhs can have the following values: (-1) % (message modulus * carry
  // modulus), 0, 1 So the output values after the addition will be: 0, 1, 2
-  legacy_host_integer_radix_add_scalar_one_inplace<Torus>(
+  host_integer_radix_add_scalar_one_inplace<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
      num_radix_blocks, message_modulus, carry_modulus);
 }
@@ -110,13 +110,13 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    };

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
-        lut->get_max_degree(0), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, scalar_last_leaf_lut_f);
+    generate_device_accumulator<Torus>(streams[0], gpu_indexes[0],
+                                       lut->get_lut(0, 0), glwe_dimension,
+                                       polynomial_size, message_modulus,
+                                       carry_modulus, scalar_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, 0);

-    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out,
        mem_ptr->tmp_lwe_array_out, bsks, ksks, 1, lut);

@@ -195,12 +195,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
-        lut->get_max_degree(0), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, 0);

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
        lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);

@@ -331,12 +331,12 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
-        lut->get_max_degree(0), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, 0);

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
        sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);

@@ -426,13 +426,12 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    auto signed_msb_lut = mem_ptr->signed_msb_lut;
    generate_device_accumulator_bivariate<Torus>(
        msb_streams[0], gpu_indexes[0], signed_msb_lut->get_lut(0, 0),
-        signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, lut_f);
    signed_msb_lut->broadcast_lut(streams, gpu_indexes, 0);

    Torus const *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
        are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
        signed_msb_lut->params.message_modulus);
@@ -492,7 +491,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
        big_lwe_dimension, 1, 1, message_modulus, carry_modulus);

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
        encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
        mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
@@ -541,10 +540,10 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(

  // Selector
  // CMUX for Max or Min
-  legacy_host_integer_radix_cmux_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, sign, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
-      total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                    lwe_array_out, sign, lwe_array_left,
+                                    lwe_array_right, mem_ptr->cmux_buffer, bsks,
+                                    ksks, total_num_radix_blocks);
 }

 template <typename Torus>
@@ -622,7 +621,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(

  // Selector
  // CMUX for Max or Min
-  legacy_host_integer_radix_cmux_kb<Torus>(
+  host_integer_radix_cmux_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out,
      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
@@ -686,7 +685,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
                                 lsb_streams[0], gpu_indexes[0]);
    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

-    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
        bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
  }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -47,6 +47,9 @@ __host__ void host_integer_scalar_mul_radix(
    void *const *bsks, T *const *ksks, uint32_t input_lwe_dimension,
    uint32_t message_modulus, uint32_t num_radix_blocks, uint32_t num_scalars) {

+  if (num_radix_blocks == 0 || num_scalars == 0)
+    return; // early exit: either count is zero, nothing to process
+
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  uint32_t lwe_size = input_lwe_dimension + 1;
@@ -55,7 +58,7 @@ __host__ void host_integer_scalar_mul_radix(
  uint32_t num_ciphertext_bits = msg_bits * num_radix_blocks;

  T *preshifted_buffer = mem->preshifted_buffer;
-  T *all_shifted_buffer = (T *)mem->all_shifted_buffer->ptr;
+  T *all_shifted_buffer = mem->all_shifted_buffer;

  for (size_t shift_amount = 0; shift_amount < msg_bits; shift_amount++) {
    T *ptr = preshifted_buffer + shift_amount * lwe_size * num_radix_blocks;
@@ -63,7 +66,7 @@ __host__ void host_integer_scalar_mul_radix(
      cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
                                   lwe_size_bytes * num_radix_blocks,
                                   streams[0], gpu_indexes[0]);
-      legacy_host_integer_radix_logical_scalar_shift_kb_inplace<T>(
+      host_integer_radix_logical_scalar_shift_kb_inplace<T>(
          streams, gpu_indexes, gpu_count, ptr, shift_amount,
          mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
    } else {
@@ -80,7 +83,7 @@ __host__ void host_integer_scalar_mul_radix(
          preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
      T *block_shift_buffer =
          all_shifted_buffer + j * num_radix_blocks * lwe_size;
-      legacy_host_radix_blocks_rotate_right<T>(
+      host_radix_blocks_rotate_right<T>(
          streams, gpu_indexes, gpu_count, block_shift_buffer,
          preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size);
      // create trivial assign for value = 0
@@ -114,7 +117,7 @@ __host__ void host_integer_scalar_mul_radix(
    auto scp_mem_ptr = mem->sc_prop_mem;
    uint32_t requested_flag = outputFlag::FLAG_NONE;
    uint32_t uses_carry = 0;
-    legacy_host_propagate_single_carry<T>(
+    host_propagate_single_carry<T>(
        streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr,
        scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, uses_carry);
  }
@@ -122,12 +125,12 @@ __host__ void host_integer_scalar_mul_radix(

 // Small scalar_mul is used in shift/rotate
 template <typename T>
-__host__ void host_legacy_integer_small_scalar_mul_radix(
+__host__ void host_integer_small_scalar_mul_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, T *output_lwe_array, T *input_lwe_array, T scalar,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -143,42 +146,4 @@ __host__ void host_legacy_integer_small_scalar_mul_radix(
      input_lwe_ciphertext_count);
  check_cuda_error(cudaGetLastError());
 }
-
-// Small scalar_mul is used in shift/rotate
-template <typename T>
-__host__ void host_integer_small_scalar_mul_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *output_lwe_array,
-    CudaRadixCiphertextFFI *input_lwe_array, T scalar) {
-
-  if (output_lwe_array->num_radix_blocks != input_lwe_array->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-  if (output_lwe_array->lwe_dimension != input_lwe_array->lwe_dimension)
-    PANIC("Cuda error: input and output lwe_dimension must be the same")
-
-  cuda_set_device(gpu_indexes[0]);
-  auto lwe_dimension = input_lwe_array->lwe_dimension;
-  auto num_radix_blocks = input_lwe_array->num_radix_blocks;
-
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = num_radix_blocks * lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  device_small_scalar_radix_multiplication<<<grid, thds, 0, streams[0]>>>(
-      (T *)output_lwe_array->ptr, (T *)input_lwe_array->ptr, scalar,
-      lwe_dimension, num_radix_blocks);
-  check_cuda_error(cudaGetLastError());
-
-  for (int i = 0; i < num_radix_blocks; i++) {
-    output_lwe_array->noise_levels[i] =
-        input_lwe_array->noise_levels[i] * scalar;
-    output_lwe_array->degrees[i] = input_lwe_array->degrees[i] * scalar;
-  }
-}
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cu
@@ -22,13 +22,14 @@ void scratch_cuda_integer_radix_scalar_rotate_kb_64(

 void cuda_integer_radix_scalar_rotate_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, uint32_t n, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *lwe_array, uint32_t n, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {

  host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, n,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), n,
      (int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (uint64_t **)(ksks), num_blocks);
 }

 void cleanup_cuda_integer_radix_scalar_rotate(void *const *streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -26,14 +26,18 @@ __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
 template <typename Torus>
 __host__ void host_integer_radix_scalar_rotate_kb_inplace(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
+    uint32_t gpu_count, Torus *lwe_array, uint32_t n,
    int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
-    Torus *const *ksks) {
+    Torus *const *ksks, uint32_t num_blocks) {

-  auto num_blocks = lwe_array->num_radix_blocks;
  auto params = mem->params;
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
  auto message_modulus = params.message_modulus;

+  size_t big_lwe_size = glwe_dimension * polynomial_size + 1;
+  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
+
  size_t num_bits_in_message = (size_t)log2_int(message_modulus);
  size_t total_num_bits = num_bits_in_message * num_blocks;
  n = n % total_num_bits;
@@ -44,7 +48,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
  size_t rotations = n / num_bits_in_message;
  size_t shift_within_block = n % num_bits_in_message;

-  auto rotated_buffer = mem->tmp_rotated;
+  Torus *rotated_buffer = mem->tmp_rotated;

  // rotate right all the blocks in radix ciphertext
  // copy result in new buffer
@@ -55,11 +59,11 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
    // rotate right as the blocks are from LSB to MSB
    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
                                          rotated_buffer, lwe_array, rotations,
-                                          num_blocks);
+                                          num_blocks, big_lwe_size);

-    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                             lwe_array, 0, num_blocks,
-                                             rotated_buffer, 0, num_blocks);
+    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
+                                 num_blocks * big_lwe_size_bytes, streams[0],
+                                 gpu_indexes[0]);

    if (shift_within_block == 0) {
      return;
@@ -69,24 +73,24 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
    auto giver_blocks = rotated_buffer;
    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
                                          giver_blocks, lwe_array, 1,
-                                          num_blocks);
+                                          num_blocks, big_lwe_size);

    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
-        giver_blocks, bsks, ksks, lut_bivariate, num_blocks,
+        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
        lut_bivariate->params.message_modulus);

  } else {
    // rotate left as the blocks are from LSB to MSB
    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
                                         rotated_buffer, lwe_array, rotations,
-                                         num_blocks);
+                                         num_blocks, big_lwe_size);

-    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                             lwe_array, 0, num_blocks,
-                                             rotated_buffer, 0, num_blocks);
+    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
+                                 num_blocks * big_lwe_size_bytes, streams[0],
+                                 gpu_indexes[0]);

    if (shift_within_block == 0) {
      return;
@@ -95,14 +99,14 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
-                                         giver_blocks, lwe_array, 1,
-                                         num_blocks);
+                                         giver_blocks, lwe_array, 1, num_blocks,
+                                         big_lwe_size);

    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
-        giver_blocks, bsks, ksks, lut_bivariate, num_blocks,
+        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
        lut_bivariate->params.message_modulus);
  }
 }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
David Testé	cae938a75b	wip: measure latencies of a list of cts	2025-01-28 11:19:27 +01:00
David Testé	bae1d1cf77	WIP: fix gpu streams and use iter_batched	2025-01-22 10:56:08 +01:00
David Testé	a3bc1a9d9e	chore(bench): new heuristic to define elements for throughput This is done to fill up backend with enough elements to fill the backend and avoid having long execution time for heavy operations like multiplication or division.	2025-01-20 15:21:05 +01:00