wip: measure latencies of a list of cts

WIP: fix gpu streams and use iter_batched
chore(bench): new heuristic to define elements for throughput
2026-04-28 03:01:21 -04:00 · 2025-01-28 11:19:27 +01:00 · 2025-01-22 10:56:08 +01:00 · 2025-01-20 15:21:05 +01:00
289 changed files with 6566 additions and 10593 deletions
--- a/.github/workflows/aws_tfhe_backward_compat_tests.yml
+++ b/.github/workflows/aws_tfhe_backward_compat_tests.yml
@@ -11,47 +11,15 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'

 jobs:
-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (backward-compat-tests)
-    needs: check-user-permission
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
@@ -71,7 +39,7 @@ jobs:
    name: Backward compatibility tests
    needs: [ setup-instance ]
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -79,8 +47,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -123,7 +90,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Backward compatibility tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (backward-compat-tests)
@@ -147,4 +114,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (backward-compat-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_fast_tests.yml
+++ b/.github/workflows/aws_tfhe_fast_tests.yml
@@ -36,7 +36,7 @@ jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
@@ -62,13 +62,13 @@ jobs:
      user_docs_test: ${{ env.IS_PULL_REQUEST == 'false' ||
        steps.changed-files.outputs.user_docs_any_changed ||
        steps.changed-files.outputs.dependencies_any_changed }}
+      ci_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ci_any_changed }}
      any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
          ref: ${{ env.REF }}

@@ -76,6 +76,7 @@ jobs:
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            dependencies:
              - tfhe/Cargo.toml
@@ -120,9 +121,13 @@ jobs:
              - '!tfhe/src/c_api/**'
              - 'tfhe/docs/**/**.md'
              - README.md
+            ci:
+              - .github/**
+              - ci/**

      - name: Aggregate file changes
        id: aggregated-changes
+        # CI files are not included in this aggregator.
        if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
          steps.changed-files.outputs.csprng_any_changed == 'true' ||
          steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
@@ -137,20 +142,13 @@ jobs:
        run: |
          echo "any_changed=true" >> "$GITHUB_OUTPUT"

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
  # Fail if the triggering actor is not part of Zama organization.
  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
  check-user-permission:
-    needs: check-ci-files
+    needs: should-run
    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
+      (github.event_name == 'pull_request_target' && needs.should-run.outputs.ci_file_changed == 'false')
+    uses: ./.github/workflows/check_triggering_actor.yml
    secrets:
      TOKEN: ${{ secrets.GITHUB_TOKEN }}

--- a/.github/workflows/aws_tfhe_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_integer_tests.yml
@@ -10,31 +10,16 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
  # We clear the cache to reduce memory pressure because of the numerous processes of cargo
  # nextest
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
  NO_BIG_PARAMS: FALSE
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+    types: [labeled]
  push:
    branches:
      - main
@@ -43,11 +28,12 @@ jobs:
  should-run:
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event_name == 'pull_request_target' && contains(github.event.label.name, 'approved')) ||
+      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
        steps.changed-files.outputs.integer_any_changed }}
@@ -56,14 +42,14 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          persist-credentials: "false"

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            integer:
              - tfhe/Cargo.toml
@@ -75,30 +61,13 @@ jobs:
              - tfhe/src/integer/**
              - .github/workflows/aws_tfhe_integer_tests.yml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (unsigned-integer-tests)
-    needs: [ should-run, check-user-permission ]
+    needs: should-run
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
@@ -119,7 +88,7 @@ jobs:
    name: Unsigned integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -127,8 +96,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: "false"
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -136,7 +104,7 @@ jobs:
          toolchain: stable

      - name: Should skip big parameters set
-        if: github.event_name == 'pull_request_target'
+        if: github.event_name == 'pull_request'
        run: |
          echo "NO_BIG_PARAMS=TRUE" >> "${GITHUB_ENV}"

@@ -162,7 +130,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (unsigned-integer-tests)
@@ -186,4 +154,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_signed_integer_tests.yml
+++ b/.github/workflows/aws_tfhe_signed_integer_tests.yml
@@ -10,31 +10,16 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
  # We clear the cache to reduce memory pressure because of the numerous processes of cargo
  # nextest
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
  NO_BIG_PARAMS: FALSE
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+    types: [labeled]
  push:
    branches:
      - main
@@ -44,11 +29,11 @@ jobs:
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      ((github.event_name == 'pull_request_target' || github.event_name == 'pull_request_target') && contains(github.event.label.name, 'approved')) ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      integer_test: ${{ github.event_name == 'workflow_dispatch' ||
        steps.changed-files.outputs.integer_any_changed }}
@@ -57,14 +42,14 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
+          persist-credentials: "false"

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            integer:
              - tfhe/Cargo.toml
@@ -76,30 +61,13 @@ jobs:
              - tfhe/src/integer/**
              - .github/workflows/aws_tfhe_signed_integer_tests.yml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (unsigned-integer-tests)
-    needs: [ should-run, check-user-permission ]
+    needs: should-run
    if:
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
-      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.integer_test == 'true') ||
+      (github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    outputs:
@@ -120,7 +88,7 @@ jobs:
    name: Signed integer tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -128,8 +96,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: "false"
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -137,7 +104,7 @@ jobs:
          toolchain: stable

      - name: Should skip big parameters set
-        if: github.event_name == 'pull_request_target'
+        if: github.event_name == 'pull_request'
        run: |
          echo "NO_BIG_PARAMS=TRUE" >> "${GITHUB_ENV}"

@@ -167,7 +134,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (signed-integer-tests)
@@ -191,4 +158,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_tests.yml
+++ b/.github/workflows/aws_tfhe_tests.yml
@@ -10,28 +10,13 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
  schedule:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"
@@ -42,7 +27,7 @@ jobs:
    if: github.event_name != 'schedule' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
      zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
@@ -78,14 +63,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            dependencies:
              - tfhe/Cargo.toml
@@ -147,28 +131,11 @@ jobs:
        run: |
          echo "any_changed=true" >> "$GITHUB_OUTPUT"

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cpu-tests)
-    if: github.event_name != 'pull_request_target' ||
+    if: github.event_name != 'pull_request' ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.any_file_changed == 'true')
-    needs: [ should-run, check-user-permission ]
+    needs: should-run
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
@@ -186,11 +153,11 @@ jobs:

  cpu-tests:
    name: CPU tests
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    needs: [ should-run, setup-instance ]
    concurrency:
-      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -198,8 +165,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -274,7 +240,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cpu-tests)
@@ -298,4 +264,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cpu-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/aws_tfhe_wasm_tests.yml
+++ b/.github/workflows/aws_tfhe_wasm_tests.yml
@@ -10,49 +10,16 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'

 jobs:
-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (wasm-tests)
-    needs: check-user-permission
    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
    runs-on: ubuntu-latest
    outputs:
@@ -73,7 +40,7 @@ jobs:
    name: WASM tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -81,8 +48,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -143,7 +109,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (wasm-tests)
@@ -167,4 +133,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (wasm-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/benchmark_boolean.yml
+++ b/.github/workflows/benchmark_boolean.yml
@@ -51,8 +51,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -104,8 +103,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_core_crypto.yml
+++ b/.github/workflows/benchmark_core_crypto.yml
@@ -50,8 +50,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -95,8 +94,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_erc20.yml
+++ b/.github/workflows/benchmark_erc20.yml
@@ -52,8 +52,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -73,8 +72,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks
        run: |
--- a/.github/workflows/benchmark_gpu_4090.yml
+++ b/.github/workflows/benchmark_gpu_4090.yml
@@ -11,53 +11,20 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
  FAST_BENCH: TRUE
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+    types: [labeled]
  schedule:
    # Weekly benchmarks will be triggered each Friday at 9p.m.
    - cron: "0 21 * * 5"

 jobs:
-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  cuda-integer-benchmarks:
    name: Cuda integer benchmarks (RTX 4090)
-    needs: check-user-permission
    if: ${{ github.event_name == 'workflow_dispatch' ||
      github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs' ||
      contains(github.event.label.name, '4090_bench') }}
@@ -66,14 +33,16 @@ jobs:
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ["self-hosted", "4090-desktop"]
    timeout-minutes: 1440 # 24 hours
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -94,8 +63,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run integer benchmarks
        run: |
@@ -131,7 +99,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  cuda-core-crypto-benchmarks:
    name: Cuda core crypto benchmarks  (RTX 4090)
@@ -148,8 +116,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -169,8 +136,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run core crypto benchmarks
        run: |
@@ -216,11 +182,11 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  remove_github_label:
    name: Remove 4090 bench label
-    if: ${{ always() && github.event_name == 'pull_request_target' }}
+    if: ${{ always() && github.event_name == 'pull_request' }}
    needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
    runs-on: ubuntu-latest
    steps:
--- a/.github/workflows/benchmark_gpu_core_crypto.yml
+++ b/.github/workflows/benchmark_gpu_core_crypto.yml
@@ -53,8 +53,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -109,8 +108,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_gpu_erc20_common.yml
+++ b/.github/workflows/benchmark_gpu_erc20_common.yml
@@ -14,7 +14,7 @@ on:
        type: string
        required: true
    secrets:
-      REPO_CHECKOUT_TOKEN:
+      FHE_ACTIONS_TOKEN:
        required: true
      SLAB_ACTION_TOKEN:
        required: true
@@ -80,8 +80,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -135,8 +134,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_gpu_integer_common.yml
+++ b/.github/workflows/benchmark_gpu_integer_common.yml
@@ -26,7 +26,7 @@ on:
        type: boolean
        default: false
    secrets:
-      REPO_CHECKOUT_TOKEN:
+      FHE_ACTIONS_TOKEN:
        required: true
      SLAB_ACTION_TOKEN:
        required: true
@@ -150,8 +150,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -211,8 +210,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_integer.yml
+++ b/.github/workflows/benchmark_integer.yml
@@ -119,8 +119,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -140,8 +139,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Should run benchmarks with all precisions
        if: inputs.all_precisions
--- a/.github/workflows/benchmark_shortint.yml
+++ b/.github/workflows/benchmark_shortint.yml
@@ -82,8 +82,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -103,8 +102,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
--- a/.github/workflows/benchmark_signed_integer.yml
+++ b/.github/workflows/benchmark_signed_integer.yml
@@ -119,8 +119,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -140,8 +139,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Should run benchmarks with all precisions
        if: inputs.all_precisions
--- a/.github/workflows/benchmark_tfhe_fft.yml
+++ b/.github/workflows/benchmark_tfhe_fft.yml
@@ -94,8 +94,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_tfhe_ntt.yml
+++ b/.github/workflows/benchmark_tfhe_ntt.yml
@@ -94,8 +94,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_tfhe_zk_pok.yml
+++ b/.github/workflows/benchmark_tfhe_zk_pok.yml
@@ -38,6 +38,7 @@ jobs:
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            zk_pok:
              - tfhe-zk-pok/**
@@ -79,8 +80,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -100,8 +100,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks
        run: |
@@ -132,8 +131,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_wasm_client.yml
+++ b/.github/workflows/benchmark_wasm_client.yml
@@ -28,7 +28,7 @@ jobs:
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
    steps:
@@ -36,13 +36,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            wasm_bench:
              - tfhe/Cargo.toml
@@ -88,8 +88,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -177,8 +176,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/benchmark_zk_pke.yml
+++ b/.github/workflows/benchmark_zk_pke.yml
@@ -43,13 +43,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            zk_pok:
              - tfhe/Cargo.toml
@@ -130,8 +130,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Get benchmark details
        run: |
@@ -151,8 +150,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Run benchmarks with AVX512
        run: |
@@ -189,8 +187,7 @@ jobs:
        with:
          repository: zama-ai/slab
          path: slab
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Send data to Slab
        shell: bash
--- a/.github/workflows/check_actor_permissions.yml
+++ b/.github/workflows/check_actor_permissions.yml
@@ -1,39 +0,0 @@
-# Check if an actor is a collaborator and has write access
-name: Check Actor Permissions
-
-on:
-  workflow_call:
-    inputs:
-      username:
-        type: string
-        default: ${{ github.triggering_actor }}
-    outputs:
-      is_authorized:
-        value: ${{ jobs.check-actor-permission.outputs.actor_authorized }}
-    secrets:
-      TOKEN:
-        required: true
-
-jobs:
-  check-actor-permission:
-    runs-on: ubuntu-latest
-    outputs:
-      actor_authorized: ${{ steps.check-access.outputs.require-result }}
-    steps:
-      - name: Get User Permission
-        id: check-access
-        uses: actions-cool/check-user-permission@7b90a27f92f3961b368376107661682c441f6103 # v2.3.0
-        with:
-          require: write
-          username: ${{ inputs.username }}
-        env:
-          GITHUB_TOKEN: ${{ secrets.TOKEN }}
-
-      - name: Check User Permission
-        if: ${{ !(inputs.username == 'dependabot[bot]' || inputs.username == 'cla-bot[bot]') &&
-          steps.check-access.outputs.require-result == 'false' }}
-        run: |
-          echo "${{ inputs.username }} does not have permissions on this repo."
-          echo "Current permission level is ${{ steps.check-access.outputs.user-permission }}"
-          echo "Job originally triggered by ${{ github.actor }}"
-          exit 1
--- a/.github/workflows/check_ci_files_change.yml
+++ b/.github/workflows/check_ci_files_change.yml
@@ -1,40 +0,0 @@
-# Check if there is any change in CI files since last commit
-name: Check changes in CI files
-
-on:
-  workflow_call:
-    inputs:
-      checkout_ref:
-        type: string
-        required: true
-    outputs:
-      ci_file_changed:
-        value: ${{ jobs.check-changes.outputs.ci_file_changed }}
-    secrets:
-      REPO_CHECKOUT_TOKEN:
-        required: true
-
-jobs:
-  check-changes:
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: read
-    outputs:
-      ci_file_changed: ${{ steps.changed-files.outputs.ci_any_changed }}
-    steps:
-      - name: Checkout tfhe-rs
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ inputs.checkout_ref }}
-
-      - name: Check for file changes
-        id: changed-files
-        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
-        with:
-          files_yaml: |
-            ci:
-              - .github/**
-              - ci/**
--- a/.github/workflows/check_commit.yml
+++ b/.github/workflows/check_commit.yml
@@ -2,7 +2,6 @@
 name: Check commit and PR compliance
 on:
  pull_request:
-
 jobs:
  check-commit-pr:
    name: Check commit and PR
--- a/.github/workflows/check_external_pr.yml
+++ b/.github/workflows/check_external_pr.yml
@@ -1,32 +0,0 @@
-# Check if a pull request fulfill pre-conditions to be accepted
-name: Check PR from fork
-
-on:
-  pull_request_target:
-    paths:
-      - '.github/**'
-      - 'ci/**'
-
-jobs:
-  # Fail if the triggering actor is not part of Zama organization.
-  check-user-permission:
-    name: Check event user permissions
-    uses: ./.github/workflows/check_actor_permissions.yml
-    with:
-      username: ${{ github.event.pull_request.user.login }}
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  write-comment:
-    name: Write PR comment
-    if: ${{ always() && needs.check-user-permission.outputs.is_authorized == 'false' }}
-    needs: check-user-permission
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: write
-    steps:
-      - name: Write warning
-        uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b
-        with:
-          message: |
-            CI files have changed. Only Zama organization members are authorized to modify these files.
--- a/.github/workflows/check_triggering_actor.yml
+++ b/.github/workflows/check_triggering_actor.yml
@@ -0,0 +1,29 @@
+# Check if triggering actor is a collaborator and has write access
+name: Check Triggering Actor
+
+on:
+  workflow_call:
+    secrets:
+      TOKEN:
+        required: true
+
+jobs:
+  check-actor-permission:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Get User Permission
+        id: check-access
+        uses: actions-cool/check-user-permission@7b90a27f92f3961b368376107661682c441f6103 # v2.3.0
+        with:
+          require: write
+          username: ${{ github.triggering_actor }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.TOKEN }}
+
+      - name: Check User Permission
+        if: steps.check-access.outputs.require-result == 'false'
+        run: |
+          echo "${{ github.triggering_actor }} does not have permissions on this repo."
+          echo "Current permission level is ${{ steps.check-access.outputs.user-permission }}"
+          echo "Job originally triggered by ${{ github.actor }}"
+          exit 1
--- a/.github/workflows/ci_lint.yml
+++ b/.github/workflows/ci_lint.yml
@@ -14,9 +14,6 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Get actionlint
        run: |
@@ -34,4 +31,3 @@ jobs:
        with:
          allowlist: |
            slsa-framework/slsa-github-generator
-            ./
--- a/.github/workflows/code_coverage.yml
+++ b/.github/workflows/code_coverage.yml
@@ -83,7 +83,7 @@ jobs:
          make test_shortint_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@13ce06bfc6bbe3ecf90edbbf1bc32fe5978ca1d3
+        uses: codecov/codecov-action@1e68e06f1dbfde0e4cefc87efeba9e4643565303
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
@@ -97,7 +97,7 @@ jobs:
          make test_integer_cov

      - name: Upload tfhe coverage to Codecov
-        uses: codecov/codecov-action@13ce06bfc6bbe3ecf90edbbf1bc32fe5978ca1d3
+        uses: codecov/codecov-action@1e68e06f1dbfde0e4cefc87efeba9e4643565303
        if: steps.changed-files.outputs.tfhe_any_changed == 'true'
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
--- a/.github/workflows/csprng_randomness_tests.yml
+++ b/.github/workflows/csprng_randomness_tests.yml
@@ -10,49 +10,16 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'

 jobs:
-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (csprng-randomness-tests)
-    needs: check-user-permission
    if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
    runs-on: ubuntu-latest
    outputs:
@@ -73,7 +40,7 @@ jobs:
    name: CSPRNG randomness tests
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    steps:
@@ -81,8 +48,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -99,7 +65,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "tfhe-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (csprng-randomness-tests)
@@ -123,4 +89,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (csprng-randomness-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/data_pr_close.yml
+++ b/.github/workflows/data_pr_close.yml
@@ -8,8 +8,6 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
  PR_BRANCH: ${{ github.head_ref || github.ref_name }}
  CLOSE_TYPE: ${{ github.event.pull_request.merged && 'merge' || 'close' }}

@@ -17,8 +15,6 @@ env:
 on:
  pull_request:
    types: [ closed ]
-  pull_request_target:
-    types: [ closed ]

 # The same pattern is used for jobs that use the github api:
 # - save the result of the API call in the env var "GH_API_RES". Since the var is multiline
--- a/.github/workflows/gpu_4090_tests.yml
+++ b/.github/workflows/gpu_4090_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an RTX 4090 machine
-name: Cuda - 4090 full tests
+name: TFHE Cuda Backend - 4090 full tests

 env:
  CARGO_TERM_COLOR: always
@@ -11,57 +11,24 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
  schedule:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"

 jobs:
-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  cuda-tests-linux:
    name: CUDA tests (RTX 4090)
-    needs: check-user-permission
    if: github.event_name == 'workflow_dispatch' ||
      contains(github.event.label.name, '4090_test') ||
      (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ["self-hosted", "4090-desktop"]

@@ -70,8 +37,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -103,7 +69,7 @@ jobs:
          make test_high_level_api_gpu

      - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
-        if: ${{ always() && github.event_name == 'pull_request_target' }}
+        if: ${{ always() && github.event_name == 'pull_request' }}
        with:
          labels: 4090_test
          github_token: ${{ secrets.GITHUB_TOKEN }}
@@ -114,4 +80,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_fast_h100_tests.yml
+++ b/.github/workflows/gpu_fast_h100_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: Cuda - Fast tests on H100
+name: TFHE Cuda Backend - Fast tests on H100

 env:
  CARGO_TERM_COLOR: always
@@ -11,34 +11,19 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -46,14 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -72,27 +56,10 @@ jobs:
              - scripts/integer-tests.sh
              - ci/slab.toml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-h100-tests)
-    needs: [ should-run, check-user-permission ]
-    if: github.event_name != 'pull_request_target' ||
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
@@ -113,10 +80,10 @@ jobs:
  cuda-tests-linux:
    name: CUDA H100 tests
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -132,8 +99,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -179,7 +145,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
@@ -203,4 +169,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_fast_tests.yml
+++ b/.github/workflows/gpu_fast_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an AWS instance
-name: Cuda - Fast tests
+name: TFHE Cuda Backend - Fast tests

 env:
  CARGO_TERM_COLOR: always
@@ -11,32 +11,18 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -44,14 +30,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -70,27 +55,10 @@ jobs:
              - scripts/integer-tests.sh
              - ci/slab.toml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-tests)
-    needs: [ should-run, check-user-permission ]
-    if: github.event_name == 'workflow_dispatch' ||
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
      needs.should-run.outputs.gpu_test == 'true'
    runs-on: ubuntu-latest
    outputs:
@@ -110,10 +78,10 @@ jobs:
  cuda-tests-linux:
    name: CUDA tests
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -129,8 +97,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -176,7 +143,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests)
@@ -200,4 +167,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_h100_tests.yml
+++ b/.github/workflows/gpu_full_h100_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
-name: Cuda - Full tests on H100
+name: TFHE Cuda Backend - Full tests on H100

 env:
  CARGO_TERM_COLOR: always
@@ -11,6 +11,7 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  workflow_dispatch:
@@ -65,7 +66,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -109,7 +110,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
@@ -132,4 +133,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_full_multi_gpu_tests.yml
+++ b/.github/workflows/gpu_full_multi_gpu_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend on an AWS instance
-name: Cuda - Full tests multi-GPU
+name: TFHE Cuda Backend - Full tests multi-GPU

 env:
  CARGO_TERM_COLOR: always
@@ -11,34 +11,19 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -46,14 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -72,27 +56,10 @@ jobs:
              - scripts/integer-tests.sh
              - ci/slab.toml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-tests-multi-gpu)
-    needs: [ should-run, check-user-permission ]
-    if: github.event_name != 'pull_request_target' ||
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
@@ -113,10 +80,10 @@ jobs:
  cuda-tests-linux:
    name: CUDA multi-GPU tests
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -132,8 +99,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -182,7 +148,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Multi-GPU tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests-multi-gpu)
@@ -206,4 +172,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-tests-multi-gpu) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_pcc.yml
+++ b/.github/workflows/gpu_pcc.yml
@@ -1,5 +1,5 @@
 # Perfom tfhe-cuda-backend post-commit checks on an AWS instance
-name: Cuda - Post-commit Checks
+name: TFHE Cuda Backend - Post-commit Checks

 env:
  CARGO_TERM_COLOR: always
@@ -11,45 +11,13 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'

 jobs:
-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-pcc)
-    needs: check-user-permission
    runs-on: ubuntu-latest
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
@@ -69,7 +37,7 @@ jobs:
    name: CUDA post-commit checks
    needs: setup-instance
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: true
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -88,8 +56,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
@@ -133,7 +100,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "CUDA AWS post-commit checks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-pcc)
@@ -157,4 +124,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-pcc) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-pcc) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_classic_tests.yml
+++ b/.github/workflows/gpu_signed_integer_classic_tests.yml
@@ -1,5 +1,5 @@
 # Signed integer GPU tests on an RTXA6000 VM on hyperstack with classical PBS
-name: Cuda - Signed integer tests with classical PBS
+name: TFHE Cuda Backend - Signed integer tests with classical PBS

 env:
  CARGO_TERM_COLOR: always
@@ -11,34 +11,19 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -46,14 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -72,27 +56,10 @@ jobs:
              - scripts/integer-tests.sh
              - ci/slab.toml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-signed-classic-tests)
-    needs: [ should-run, check-user-permission ]
-    if: github.event_name != 'pull_request_target' ||
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
@@ -113,10 +80,10 @@ jobs:
  cuda-tests-linux:
    name: CUDA signed integer tests with classical PBS
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -130,10 +97,6 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -165,7 +128,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Integer GPU signed integer tests with classical PBS finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer GPU signed integer tests with classical PBS finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-signed-classic-tests)
@@ -189,4 +152,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-signed-classic-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-signed-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_h100_tests.yml
+++ b/.github/workflows/gpu_signed_integer_h100_tests.yml
@@ -1,5 +1,5 @@
 # Signed integer GPU tests on an H100 VM on hyperstack
-name: Cuda - Signed integer tests on H100
+name: TFHE Cuda Backend - Signed integer tests on H100

 env:
  CARGO_TERM_COLOR: always
@@ -11,34 +11,19 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -46,14 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -72,27 +56,10 @@ jobs:
              - scripts/integer-tests.sh
              - ci/slab.toml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-h100-tests)
-    needs: [ should-run, check-user-permission ]
-    if: github.event_name != 'pull_request_target' ||
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
@@ -113,10 +80,10 @@ jobs:
  cuda-tests-linux:
    name: CUDA H100 signed integer tests
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -130,10 +97,6 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -165,7 +128,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
@@ -189,4 +152,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_signed_integer_tests.yml
+++ b/.github/workflows/gpu_signed_integer_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend signed integer on an AWS instance
-name: Cuda - Signed integer tests
+name: TFHE Cuda Backend - Signed integer tests

 env:
  CARGO_TERM_COLOR: always
@@ -11,28 +11,17 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
  FAST_TESTS: TRUE
  NIGHTLY_TESTS: FALSE
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+    types:
+      - opened
+      - synchronize
  schedule:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"
@@ -41,7 +30,7 @@ jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -49,14 +38,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -75,27 +63,10 @@ jobs:
              - scripts/integer-tests.sh
              - ci/slab.toml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-signed-integer-tests)
    runs-on: ubuntu-latest
-    needs: [ should-run, check-user-permission ]
+    needs: should-run
    if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      github.event_name == 'workflow_dispatch' ||
      needs.should-run.outputs.gpu_test == 'true'
@@ -116,10 +87,10 @@ jobs:
  cuda-signed-integer-tests:
    name: CUDA signed integer tests
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -135,8 +106,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -176,7 +146,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-signed-integer-tests.result }}
-          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Base GPU tests finished with status: ${{ needs.cuda-signed-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests)
@@ -200,4 +170,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-signed-integer-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-signed-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_classic_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_classic_tests.yml
@@ -1,5 +1,5 @@
 # Test unsigned integers on an RTXA6000 VM on hyperstack with the classical PBS
-name: Cuda - Unsigned integer tests with classical PBS
+name: TFHE Cuda Backend - Unsigned integer tests with classical PBS

 env:
  CARGO_TERM_COLOR: always
@@ -11,34 +11,19 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -46,14 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -72,27 +56,10 @@ jobs:
              - scripts/integer-tests.sh
              - ci/slab.toml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-unsigned-classic-tests)
-    needs: [ should-run, check-user-permission ]
-    if: github.event_name == 'workflow_dispatch' ||
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
@@ -113,10 +80,10 @@ jobs:
  cuda-tests-linux:
    name: CUDA unsigned integer tests with classical PBS
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -130,10 +97,6 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -165,7 +128,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Unsigned integer GPU classic tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Unsigned integer GPU classic tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-unsigned-classic-tests)
@@ -189,4 +152,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-classic-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-classic-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_h100_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_h100_tests.yml
@@ -1,5 +1,5 @@
 # Test unsigned integers on an H100 VM on hyperstack
-name: Cuda - Unsigned integer tests on H100
+name: TFHE Cuda Backend - Unsigned integer tests on H100

 env:
  CARGO_TERM_COLOR: always
@@ -11,34 +11,19 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
-  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' || github.event_name == 'pull_request_target' }}
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}
+  IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+      types: [ labeled ]

 jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -46,14 +31,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -72,27 +56,10 @@ jobs:
              - scripts/integer-tests.sh
              - ci/slab.toml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-h100-tests)
-    needs: [ should-run, check-user-permission ]
-    if: github.event_name == 'workflow_dispatch' ||
+    needs: should-run
+    if: github.event_name != 'pull_request' ||
      (github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
      (github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
    runs-on: ubuntu-latest
@@ -113,10 +80,10 @@ jobs:
  cuda-tests-linux:
    name: CUDA H100 unsigned integer tests
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -130,10 +97,6 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -165,7 +128,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
-          SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Unsigned integer GPU H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-h100-tests)
@@ -189,4 +152,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/gpu_unsigned_integer_tests.yml
+++ b/.github/workflows/gpu_unsigned_integer_tests.yml
@@ -1,5 +1,5 @@
 # Compile and test tfhe-cuda-backend unsigned integer on an AWS instance
-name: Cuda - Unsigned integer tests
+name: TFHE Cuda Backend - Unsigned integer tests

 env:
  CARGO_TERM_COLOR: always
@@ -11,29 +11,16 @@ env:
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-  MSG_MINIMAL: event,action url,commit
-  BRANCH: ${{ github.head_ref || github.ref }}
  FAST_TESTS: TRUE
  NIGHTLY_TESTS: FALSE
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 on:
  # Allows you to run this workflow manually from the Actions tab as an alternative.
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+    types:
+      - opened
+      - synchronize
  schedule:
    # Nightly tests @ 1AM after each work day
    - cron: "0 1 * * MON-FRI"
@@ -42,7 +29,7 @@ jobs:
  should-run:
    runs-on: ubuntu-latest
    permissions:
-      pull-requests: read
+      pull-requests: write
    outputs:
      gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
    steps:
@@ -50,14 +37,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Check for file changes
        id: changed-files
        uses: tj-actions/changed-files@d6e91a2266cdb9d62096cebf1e8546899c6aa18f
        with:
+          since_last_remote_commit: true
          files_yaml: |
            gpu:
              - tfhe/Cargo.toml
@@ -76,26 +62,9 @@ jobs:
              - scripts/integer-tests.sh
              - ci/slab.toml

-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  setup-instance:
    name: Setup instance (cuda-unsigned-integer-tests)
-    needs: [ should-run, check-user-permission ]
+    needs: should-run
    if: (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
      github.event_name == 'workflow_dispatch' ||
      needs.should-run.outputs.gpu_test == 'true'
@@ -117,10 +86,10 @@ jobs:
  cuda-unsigned-integer-tests:
    name: CUDA unsigned integer tests
    needs: [ should-run, setup-instance ]
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.setup-instance.result != 'skipped')
+    if: github.event_name != 'pull_request' ||
+      (github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
    concurrency:
-      group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+      group: ${{ github.workflow }}_${{ github.ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
@@ -134,10 +103,6 @@ jobs:
    steps:
      - name: Checkout tfhe-rs
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}

      - name: Setup Hyperstack dependencies
        uses: ./.github/actions/hyperstack_setup
@@ -177,7 +142,7 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ needs.cuda-unsigned-integer-tests.result }}
-          SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Unsigned integer GPU tests finished with status: ${{ needs.cuda-unsigned-integer-tests.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-tests)
@@ -201,4 +166,4 @@ jobs:
        uses: rtCamp/action-slack-notify@c33737706dea87cd7784c687dadc9adf1be59990
        env:
          SLACK_COLOR: ${{ job.status }}
-          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-integer-tests) finished with status: ${{ job.status }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "Instance teardown (cuda-unsigned-integer-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
--- a/.github/workflows/integer_long_run_tests.yml
+++ b/.github/workflows/integer_long_run_tests.yml
@@ -51,7 +51,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
--- a/.github/workflows/m1_tests.yml
+++ b/.github/workflows/m1_tests.yml
@@ -2,20 +2,8 @@ name: Tests on M1 CPU

 on:
  workflow_dispatch:
-  # Trigger pull_request event on CI files to be able to test changes before merging to main branch.
-  # Workflow would fail if changes come from a forked repository since secrets are not available with this event.
  pull_request:
-    types: [ labeled ]
-    paths:
-      - '.github/**'
-      - 'ci/**'
-  # General entry point for Zama's pull request as well as contribution from forks.
-  pull_request_target:
-    types: [ labeled ]
-    paths:
-      - '**'
-      - '!.github/**'
-      - '!ci/**'
+    types: [labeled]
  # Have a nightly build for M1 tests
  schedule:
    # * is a special character in YAML so you have to quote this string
@@ -33,35 +21,14 @@ env:
  # We clear the cache to reduce memory pressure because of the numerous processes of cargo
  # nextest
  TFHE_RS_CLEAR_IN_MEMORY_KEY_CACHE: "1"
-  REF: ${{ github.event.pull_request.head.sha || github.sha }}

 concurrency:
-  group: ${{ github.workflow }}_${{ github.head_ref || github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref }}
  cancel-in-progress: true

 jobs:
-  check-ci-files:
-    uses: ./.github/workflows/check_ci_files_change.yml
-    with:
-      checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
-    secrets:
-      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-
-  # Fail if the triggering actor is not part of Zama organization.
-  # If pull_request_target is emitted and CI files have changed, skip this job. This would skip following jobs.
-  check-user-permission:
-    needs: check-ci-files
-    if: github.event_name != 'pull_request_target' ||
-      (github.event_name == 'pull_request_target' && needs.check-ci-files.outputs.ci_file_changed == 'false')
-    uses: ./.github/workflows/check_actor_permissions.yml
-    secrets:
-      TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
  cargo-builds-m1:
-    needs: check-user-permission
-    if: ${{ (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs') ||
-      github.event_name == 'workflow_dispatch' ||
-      contains(github.event.label.name, 'm1_test') }}
+    if: ${{ (github.event_name == 'schedule' &&  github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
    runs-on: ["self-hosted", "m1mac"]
    # 12 hours, default is 6 hours, hopefully this is more than enough
    timeout-minutes: 720
@@ -70,8 +37,6 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          persist-credentials: "false"
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
-          ref: ${{ env.REF }}

      - name: Install latest stable
        uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
@@ -213,7 +178,7 @@ jobs:
    if: ${{ always() }}
    steps:
      - uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
-        if: ${{ github.event_name == 'pull_request_target' }}
+        if: ${{ github.event_name == 'pull_request' }}
        with:
          labels: m1_test
          github_token: ${{ secrets.GITHUB_TOKEN }}
@@ -226,8 +191,6 @@ jobs:
          SLACK_COLOR: ${{ needs.cargo-builds-m1.result }}
          SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
          SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
-          SLACK_MESSAGE: "M1 tests finished with status: ${{ needs.cargo-builds-m1.result }} on '${{ env.BRANCH }}'. (${{ env.ACTION_RUN_URL }})"
+          SLACK_MESSAGE: "M1 tests finished with status: ${{ needs.cargo-builds-m1.result }}. (${{ env.ACTION_RUN_URL }})"
          SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
-          MSG_MINIMAL: event,action url,commit
-          BRANCH: ${{ github.head_ref || github.ref }}
--- a/.github/workflows/make_release.yml
+++ b/.github/workflows/make_release.yml
@@ -46,8 +46,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: Prepare package
        run: |
          cargo package -p tfhe
@@ -83,8 +82,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: Create NPM version tag
        if: ${{ inputs.npm_latest_tag }}
        run: |
--- a/.github/workflows/make_release_cuda.yml
+++ b/.github/workflows/make_release_cuda.yml
@@ -61,8 +61,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}

      - name: Set up home
        run: |
--- a/.github/workflows/make_release_tfhe_csprng.yml
+++ b/.github/workflows/make_release_tfhe_csprng.yml
@@ -30,7 +30,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe-csprng
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
        with:
          name: crate-tfhe-csprng
          path: target/package/*.crate
--- a/.github/workflows/make_release_tfhe_fft.yml
+++ b/.github/workflows/make_release_tfhe_fft.yml
@@ -33,7 +33,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe-fft
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
        with:
          name: crate
          path: target/package/*.crate
--- a/.github/workflows/make_release_tfhe_ntt.yml
+++ b/.github/workflows/make_release_tfhe_ntt.yml
@@ -33,7 +33,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe-ntt
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0
        with:
          name: crate
          path: target/package/*.crate
--- a/.github/workflows/make_release_tfhe_versionable.yml
+++ b/.github/workflows/make_release_tfhe_versionable.yml
@@ -30,7 +30,7 @@ jobs:
      - name: Prepare package
        run: |
          cargo package -p tfhe-versionable-derive
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
        with:
          name: crate-tfhe-versionable-derive
          path: target/package/*.crate
@@ -61,8 +61,6 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      - name: Download artifact
        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
        with:
@@ -105,13 +103,13 @@ jobs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
        with:
          fetch-depth: 0
      - name: Prepare package
        run: |
          cargo package -p tfhe-versionable
-      - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+      - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
        with:
          name: crate-tfhe-versionable
          path: target/package/*.crate
@@ -139,7 +137,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
+        uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
        with:
          fetch-depth: 0
      - name: Download artifact
--- a/.github/workflows/make_release_zk_pok.yml
+++ b/.github/workflows/make_release_zk_pok.yml
@@ -24,7 +24,7 @@ jobs:
        - name: Prepare package
          run: |
            cargo package -p tfhe-zk-pok
-        - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
+        - uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
          with:
            name: crate-zk-pok
            path: target/package/*.crate
@@ -61,8 +61,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: Download artifact
        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
        with:
--- a/.github/workflows/sync_on_push.yml
+++ b/.github/workflows/sync_on_push.yml
@@ -16,8 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
-          persist-credentials: 'false'
-          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
+          token: ${{ secrets.FHE_ACTIONS_TOKEN }}
      - name: git-sync
        uses: wei/git-sync@55c6b63b4f21607da0e9877ca9b4d11a29fc6d83
        with:
--- a/30
+++ b/30
@@ -20,7 +20,7 @@ BENCH_OP_FLAVOR?=DEFAULT
 BENCH_TYPE?=latency
 NODE_VERSION=22.6
 BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
-BACKWARD_COMPAT_DATA_BRANCH?=$(shell ./scripts/backward_compat_data_version.py)
+BACKWARD_COMPAT_DATA_BRANCH?=v0.5
 BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
 BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
 TFHE_SPEC:=tfhe
@@ -282,14 +282,14 @@ check_typos: install_typos_checker
 .PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
 clippy_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
-		--features=boolean,shortint,integer,internal-keycache,gpu \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
 		--all-targets \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: check_gpu # Run check on tfhe with "gpu" enabled
 check_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
-		--features=boolean,shortint,integer,internal-keycache,gpu \
+		--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats \
 		--all-targets \
 		-p $(TFHE_SPEC)

@@ -394,10 +394,10 @@ clippy_trivium: install_rs_check_toolchain
 .PHONY: clippy_all_targets # Run clippy lints on all targets (benches, examples, etc.)
 clippy_all_targets: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings
 	RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,experimental \
+		--features=boolean,shortint,integer,internal-keycache,zk-pok,strings,pbs-stats,experimental \
 		-p $(TFHE_SPEC) -- --no-deps -D warnings

 .PHONY: clippy_tfhe_csprng # Run clippy lints on tfhe-csprng
@@ -1056,35 +1056,35 @@ bench_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer # Run benchmarks for signed integer
 bench_signed_integer: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
 bench_integer_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
 bench_integer_compression: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_compression_gpu
 bench_integer_compression_gpu: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench	glwe_packing_compression-integer-bench \
-	--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,gpu,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
 bench_integer_multi_bit: install_rs_check_toolchain
@@ -1092,7 +1092,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
 bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -1100,7 +1100,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-signed-bench \
-	--features=integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
 bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1108,7 +1108,7 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) --

 .PHONY: bench_unsigned_integer_multi_bit_gpu # Run benchmarks for unsigned integer on GPU backend using multi-bit parameters
 bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -1116,14 +1116,14 @@ bench_unsigned_integer_multi_bit_gpu: install_rs_check_toolchain
 	__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench integer-bench \
-	--features=integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) -- ::unsigned
+	--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p $(TFHE_SPEC) -- ::unsigned

 .PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
 bench_integer_zk: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
 	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
 	--bench zk-pke-bench \
-	--features=integer,internal-keycache,zk-pok,nightly-avx512 \
+	--features=integer,internal-keycache,zk-pok,nightly-avx512,pbs-stats \
 	-p $(TFHE_SPEC) --

 .PHONY: bench_shortint # Run benchmarks for shortint
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -27,8 +27,6 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
    std::abort();                                                              \
  }

-void cuda_set_device(uint32_t gpu_index);
-
 cudaEvent_t cuda_create_event(uint32_t gpu_index);

 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -112,8 +112,6 @@ template <typename Torus> struct int_decompression {

      generate_device_accumulator_with_encoding<Torus>(
          streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
-          decompression_rescale_lut->get_degree(0),
-          decompression_rescale_lut->get_max_degree(0),
          encryption_params.glwe_dimension, encryption_params.polynomial_size,
          effective_compression_message_modulus,
          effective_compression_carry_modulus,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -38,15 +38,6 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
 enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };

 extern "C" {
-
-typedef struct {
-  void *ptr;
-  uint64_t *degrees;
-  uint64_t *noise_levels;
-  uint32_t num_radix_blocks;
-  uint32_t lwe_dimension;
-} CudaRadixCiphertextFFI;
-
 void scratch_cuda_apply_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -54,7 +45,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory);
+    bool allocate_gpu_memory);
 void scratch_cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -62,12 +53,13 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory);
-void cuda_apply_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks);
+    uint32_t num_many_lut, bool allocate_gpu_memory);
+void cuda_apply_univariate_lut_kb_64(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, void *output_radix_lwe,
+                                     void const *input_radix_lwe,
+                                     int8_t *mem_ptr, void *const *ksks,
+                                     void *const *bsks, uint32_t num_blocks);

 void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
                                             uint32_t const *gpu_indexes,
@@ -81,15 +73,13 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory);
+    bool allocate_gpu_memory);

 void cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe_1,
-    CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
-    uint32_t shift);
+    void *output_radix_lwe, void const *input_radix_lwe_1,
+    void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
+    void *const *bsks, uint32_t num_blocks, uint32_t shift);

 void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
                                            uint32_t const *gpu_indexes,
@@ -98,10 +88,9 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,

 void cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_luts,
-    uint32_t lut_stride);
+    void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks,
+    uint32_t num_luts, uint32_t lut_stride);

 void scratch_cuda_full_propagation_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -143,14 +132,15 @@ void cleanup_cuda_integer_mult(void *const *streams,

 void cuda_negate_integer_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
+    void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus);

 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
-    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus);
+    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus);

 void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -199,8 +189,8 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(

 void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI const *lwe_shift,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks);
+    void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);

 void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
                                                 uint32_t const *gpu_indexes,
@@ -243,17 +233,15 @@ void scratch_cuda_integer_radix_bitop_kb_64(

 void cuda_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_1,
-    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count);

 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
+    void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks);
+    void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op);

 void cleanup_cuda_integer_bitop(void *const *streams,
                                uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -270,11 +258,9 @@ void scratch_cuda_integer_radix_cmux_kb_64(

 void cuda_cmux_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_condition,
-    CudaRadixCiphertextFFI const *lwe_array_true,
-    CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks);
+    void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
+    void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_ciphertext_count);

 void cleanup_cuda_integer_radix_cmux(void *const *streams,
                                     uint32_t const *gpu_indexes,
@@ -319,16 +305,15 @@ void scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(

 void cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI *carry_out,
-    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);
+    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks,
+    uint32_t requested_flag, uint32_t uses_carry);

 void cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
-    CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t requested_flag, uint32_t uses_carry);
+    void *lhs_array, const void *rhs_array, void *carry_out,
+    const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry);

 void cleanup_cuda_propagate_single_carry(void *const *streams,
                                         uint32_t const *gpu_indexes,
@@ -426,13 +411,12 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory);
+    bool allocate_gpu_memory);

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_blocks);
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift);

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -440,8 +424,9 @@ void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(

 void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
                                            uint32_t const *gpu_indexes,
-                                            uint32_t gpu_count,
-                                            CudaRadixCiphertextFFI *lwe_array);
+                                            uint32_t gpu_count, void *lwe_array,
+                                            uint32_t num_blocks,
+                                            uint32_t lwe_size);

 void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -454,8 +439,8 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

 void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
-    void *const *bsks, void *const *ksks);
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks);

 void cleanup_cuda_integer_abs_inplace(void *const *streams,
                                      uint32_t const *gpu_indexes,
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
--- a/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
@@ -1,8 +0,0 @@
-#ifndef CUDA_RADIX_CIPHERTEXT_H
-#define CUDA_RADIX_CIPHERTEXT_H
-
-void release_radix_ciphertext(cudaStream_t const stream,
-                              uint32_t const gpu_index,
-                              CudaRadixCiphertextFFI *data);
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
+++ b/backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -1,7 +1,6 @@
 #ifndef CUDA_LINALG_H_
 #define CUDA_LINALG_H_

-#include "integer/integer.h"
 #include <stdint.h>

 extern "C" {
@@ -14,14 +13,17 @@ void cuda_negate_lwe_ciphertext_vector_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, const uint32_t input_lwe_dimension,
    const uint32_t input_lwe_ciphertext_count);
-void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
-                                       CudaRadixCiphertextFFI *output,
-                                       CudaRadixCiphertextFFI const *input_1,
-                                       CudaRadixCiphertextFFI const *input_2);
-void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
-                                       CudaRadixCiphertextFFI *output,
-                                       CudaRadixCiphertextFFI const *input_1,
-                                       CudaRadixCiphertextFFI const *input_2);
+void cuda_add_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
+void cuda_add_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count);
+
 void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_array_in, void const *plaintext_array_in,
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_multibit_utilities.h
@@ -5,12 +5,12 @@

 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
-    uint32_t polynomial_size, int max_shared_memory);
+    uint32_t polynomial_size);

 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, int max_shared_memory);
+    uint32_t level_count);

 #if CUDA_ARCH >= 900
 template <typename Torus>
@@ -114,8 +114,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
-    cuda_set_device(gpu_index);
-
    this->pbs_variant = pbs_variant;
    this->lwe_chunk_size = lwe_chunk_size;
    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -61,7 +61,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {

 template <typename Torus>
 bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
-    uint32_t polynomial_size, int max_shared_memory);
+    uint32_t polynomial_size);

 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

@@ -77,10 +77,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
             bool allocate_gpu_memory) {
-    cuda_set_device(gpu_index);
+
    this->pbs_variant = pbs_variant;

-    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+    auto max_shared_memory = cuda_get_max_shared_memory(0);

    if (allocate_gpu_memory) {
      switch (pbs_variant) {
@@ -157,7 +157,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

        bool supports_dsm =
            supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-                Torus>(polynomial_size, max_shared_memory);
+                Torus>(polynomial_size);

        uint64_t full_sm =
            get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
@@ -218,7 +218,8 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 template <typename Torus>
 uint64_t get_buffer_size_programmable_bootstrap_cg(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {
+  int max_shared_memory = cuda_get_max_shared_memory(0);
  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
  uint64_t partial_sm =
@@ -244,8 +245,7 @@ template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                   uint32_t polynomial_size,
                                                   uint32_t level_count,
-                                                   uint32_t num_samples,
-                                                   int max_shared_memory);
+                                                   uint32_t num_samples);

 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
--- a/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/pbs/programmable_bootstrap_multibit.h
@@ -8,7 +8,7 @@ extern "C" {

 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, int max_shared_memory);
+    uint32_t num_samples);

 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
    void *stream, uint32_t gpu_index, void *dest, void const *src,
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -11,7 +11,7 @@ void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
                                               uint32_t gpu_index, T *dest,
                                               T *src, uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
  cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
 }
@@ -21,7 +21,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
                                               uint32_t gpu_index, T *dest,
                                               T *src, uint32_t number_of_cts,
                                               uint32_t lwe_dimension) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
  cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }
@@ -55,7 +55,7 @@ __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
                                  Torus const *glwe_array_in,
                                  uint32_t const *nth_array, uint32_t num_nths,
                                  uint32_t glwe_dimension) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  dim3 grid(num_nths);
  dim3 thds(params::degree / params::opt);
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/fast_packing_keyswitch.cuh
@@ -261,7 +261,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(

  // Optimization of packing keyswitch when packing many LWEs

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  check_cuda_error(cudaGetLastError());

  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
@@ -57,7 +57,7 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
  if (gpu_count != 1)
    PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
          "supported yet.")
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);

  int shared_memory_size = sizeof(double) * polynomial_size;

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -105,7 +105,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
    uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
    uint32_t num_samples) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  constexpr int num_threads_y = 32;
  int num_blocks, num_threads_x;
@@ -160,7 +160,7 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
    cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_lwes, bool allocate_gpu_memory) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

--- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -110,7 +110,7 @@ template <typename Torus>
 __host__ void host_modulus_switch_inplace(cudaStream_t stream,
                                          uint32_t gpu_index, Torus *array,
                                          int size, uint32_t log_modulus) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  int num_threads = 0, num_blocks = 0;
  getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
--- a/backends/tfhe-cuda-backend/cuda/src/device.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -2,12 +2,8 @@
 #include <cstdint>
 #include <cuda_runtime.h>

-void cuda_set_device(uint32_t gpu_index) {
-  check_cuda_error(cudaSetDevice(gpu_index));
-}
-
 cudaEvent_t cuda_create_event(uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  cudaEvent_t event;
  check_cuda_error(cudaEventCreate(&event));
  return event;
@@ -15,24 +11,24 @@ cudaEvent_t cuda_create_event(uint32_t gpu_index) {

 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
                       uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaEventRecord(event, stream));
 }

 void cuda_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
                            uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaStreamWaitEvent(stream, event, 0));
 }

 void cuda_event_destroy(cudaEvent_t event, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaEventDestroy(event));
 }

 /// Unsafe function to create a CUDA stream, must check first that GPU exists
 cudaStream_t cuda_create_stream(uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  cudaStream_t stream;
  check_cuda_error(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  return stream;
@@ -40,22 +36,15 @@ cudaStream_t cuda_create_stream(uint32_t gpu_index) {

 /// Unsafe function to destroy CUDA stream, must check first the GPU exists
 void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaStreamDestroy(stream));
 }

 void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaStreamSynchronize(stream));
 }

-void synchronize_streams(cudaStream_t const *streams,
-                         uint32_t const *gpu_indexes, uint32_t gpu_count) {
-  for (uint i = 0; i < gpu_count; i++) {
-    cuda_synchronize_stream(streams[i], gpu_indexes[i]);
-  }
-}
-
 // Determine if a CUDA device is available at runtime
 uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }

@@ -63,7 +52,7 @@ uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }
 /// or if there's not enough memory. A safe wrapper around it must call
 /// cuda_check_valid_malloc() first
 void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  void *ptr;
  check_cuda_error(cudaMalloc((void **)&ptr, size));

@@ -74,7 +63,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
 /// asynchronously.
 void *cuda_malloc_async(uint64_t size, cudaStream_t stream,
                        uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  void *ptr;

 #ifndef CUDART_VERSION
@@ -97,7 +86,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream,

 /// Check that allocation is valid
 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  size_t total_mem, free_mem;
  check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
  if (size > free_mem) {
@@ -145,7 +134,7 @@ void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
    PANIC("Cuda error: invalid device pointer in async copy to GPU.")
  }

-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
 }
@@ -165,7 +154,7 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
  if (attr_src.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  if (attr_src.device == attr_dest.device) {
    check_cuda_error(
        cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
@@ -190,7 +179,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
  if (attr_src.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
  }
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  if (attr_src.device == attr_dest.device) {
    check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
  } else {
@@ -201,7 +190,7 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,

 /// Synchronizes device
 void cuda_synchronize_device(uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaDeviceSynchronize());
 }

@@ -214,7 +203,7 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    PANIC("Cuda error: invalid dest device pointer in cuda memset.")
  }
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
 }

@@ -234,7 +223,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
    if (attr.type != cudaMemoryTypeDevice) {
      PANIC("Cuda error: invalid dest device pointer in cuda set value.")
    }
-    cuda_set_device(gpu_index);
+    check_cuda_error(cudaSetDevice(gpu_index));
    int block_size = 256;
    int num_blocks = (n + block_size - 1) / block_size;

@@ -264,7 +253,7 @@ void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
    PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
  }

-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(
      cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream));
 }
@@ -278,14 +267,14 @@ int cuda_get_number_of_gpus() {

 /// Drop a cuda array
 void cuda_drop(void *ptr, uint32_t gpu_index) {
-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
  check_cuda_error(cudaFree(ptr));
 }

 /// Drop a cuda array asynchronously, if supported on the device
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index) {

-  cuda_set_device(gpu_index);
+  check_cuda_error(cudaSetDevice(gpu_index));
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
 #elif (CUDART_VERSION >= 11020)
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
@@ -22,14 +22,15 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

 void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, bool is_signed,
-    void *const *bsks, void *const *ksks) {
+    void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {

  auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;

  host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
-                                gpu_count, ct, bsks, (uint64_t **)(ksks), mem,
-                                is_signed);
+                                gpu_count, static_cast<uint64_t *>(ct), bsks,
+                                (uint64_t **)(ksks), mem, is_signed,
+                                num_blocks);
 }

 void cleanup_cuda_integer_abs_inplace(void *const *streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -2,12 +2,15 @@
 #define TFHE_RS_ABS_CUH

 #include "crypto/keyswitch.cuh"
+#include "device.h"
 #include "integer/bitwise_ops.cuh"
 #include "integer/comparison.cuh"
 #include "integer/integer.cuh"
+#include "integer/integer_utilities.h"
 #include "integer/negation.cuh"
 #include "integer/scalar_shifts.cuh"
-#include "radix_ciphertext.cuh"
+#include "linear_algebra.h"
+#include "pbs/programmable_bootstrap.h"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
 #include <fstream>
@@ -29,15 +32,16 @@ __host__ void scratch_cuda_integer_abs_kb(
 }

 template <typename Torus>
-__host__ void legacy_host_integer_abs_kb_async(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *ct, void *const *bsks, uint64_t *const *ksks,
-    int_abs_buffer<uint64_t> *mem_ptr, bool is_signed, uint32_t num_blocks) {
+__host__ void
+host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+                    uint32_t gpu_count, Torus *ct, void *const *bsks,
+                    uint64_t *const *ksks, int_abs_buffer<uint64_t> *mem_ptr,
+                    bool is_signed, uint32_t num_blocks) {
  if (!is_signed)
    return;

  auto radix_params = mem_ptr->params;
-  auto mask = (Torus *)(mem_ptr->mask->ptr);
+  auto mask = mem_ptr->mask;

  auto big_lwe_dimension = radix_params.big_lwe_dimension;
  auto big_lwe_size = big_lwe_dimension + 1;
@@ -48,56 +52,20 @@ __host__ void legacy_host_integer_abs_kb_async(
  cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
                               streams[0], gpu_indexes[0]);

-  host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
+  host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
      mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
-  legacy_host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
-                              radix_params.big_lwe_dimension, num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
+                       radix_params.big_lwe_dimension, num_blocks);

  uint32_t requested_flag = outputFlag::FLAG_NONE;
  uint32_t uses_carry = 0;
-  legacy_host_propagate_single_carry<Torus>(
+  host_propagate_single_carry<Torus>(
      streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
      bsks, ksks, num_blocks, requested_flag, uses_carry);

-  // legacy bitop
-  legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, ct, mask, ct, bsks, ksks, num_blocks,
-      mem_ptr->bitxor_mem->lut, mem_ptr->bitxor_mem->params.message_modulus);
-}
-
-template <typename Torus>
-__host__ void
-host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-                    uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
-                    void *const *bsks, uint64_t *const *ksks,
-                    int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
-  if (!is_signed)
-    return;
-
-  auto mask = mem_ptr->mask;
-
-  uint32_t num_bits_in_ciphertext =
-      (31 - __builtin_clz(mem_ptr->params.message_modulus)) *
-      ct->num_radix_blocks;
-
-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], mask, ct);
-
-  host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
-      streams, gpu_indexes, gpu_count, (Torus *)(mask->ptr),
-      num_bits_in_ciphertext - 1, mem_ptr->arithmetic_scalar_shift_mem, bsks,
-      ksks, ct->num_radix_blocks);
-  host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
-                       ct->num_radix_blocks);
-
-  uint32_t requested_flag = outputFlag::FLAG_NONE;
-  uint32_t uses_carry = 0;
-  host_propagate_single_carry<Torus>(streams, gpu_indexes, gpu_count, ct,
-                                     nullptr, nullptr, mem_ptr->scp_mem, bsks,
-                                     ksks, requested_flag, uses_carry);
-
-  host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count, ct, mask,
-                                     ct, mem_ptr->bitxor_mem, bsks, ksks);
+  host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
+                              mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
 }

 #endif // TFHE_RS_ABS_CUH
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cu
@@ -22,15 +22,17 @@ void scratch_cuda_integer_radix_bitop_kb_64(

 void cuda_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_1,
-    CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *lwe_array_out, void const *lwe_array_1, void const *lwe_array_2,
+    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t lwe_ciphertext_count) {

  host_integer_radix_bitop_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
-      lwe_array_1, lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_1),
+      static_cast<const uint64_t *>(lwe_array_2),
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      lwe_ciphertext_count);
 }

 void cleanup_cuda_integer_bitop(void *const *streams,
@@ -41,50 +43,3 @@ void cleanup_cuda_integer_bitop(void *const *streams,
      (int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
 }
-
-void update_degrees_after_bitand(uint64_t *output_degrees,
-                                 uint64_t *lwe_array_1_degrees,
-                                 uint64_t *lwe_array_2_degrees,
-                                 uint32_t num_radix_blocks) {
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    output_degrees[i] =
-        std::min(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-  }
-}
-
-void update_degrees_after_bitor(uint64_t *output_degrees,
-                                uint64_t *lwe_array_1_degrees,
-                                uint64_t *lwe_array_2_degrees,
-                                uint32_t num_radix_blocks) {
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    auto max = std::max(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-    auto min = std::min(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-    auto result = max;
-
-    for (uint j = 0; j < min + 1; j++) {
-      if (max | j > result) {
-        result = max | j;
-      }
-    }
-    output_degrees[i] = result;
-  }
-}
-
-void update_degrees_after_bitxor(uint64_t *output_degrees,
-                                 uint64_t *lwe_array_1_degrees,
-                                 uint64_t *lwe_array_2_degrees,
-                                 uint32_t num_radix_blocks) {
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    auto max = std::max(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-    auto min = std::min(lwe_array_1_degrees[i], lwe_array_2_degrees[i]);
-    auto result = max;
-
-    // Try every possibility to find the worst case
-    for (uint j = 0; j < min + 1; j++) {
-      if (max ^ j > result) {
-        result = max ^ j;
-      }
-    }
-    output_degrees[i] = result;
-  }
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh
@@ -14,34 +14,15 @@
 template <typename Torus>
 __host__ void host_integer_radix_bitop_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_1,
-    CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks) {
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_1,
+    Torus const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
+    void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks) {

  auto lut = mem_ptr->lut;
-  uint64_t degrees[lwe_array_1->num_radix_blocks];
-  if (mem_ptr->op == BITOP_TYPE::BITAND) {
-    update_degrees_after_bitand(degrees, lwe_array_1->degrees,
-                                lwe_array_2->degrees,
-                                lwe_array_1->num_radix_blocks);
-  } else if (mem_ptr->op == BITOP_TYPE::BITOR) {
-    update_degrees_after_bitor(degrees, lwe_array_1->degrees,
-                               lwe_array_2->degrees,
-                               lwe_array_1->num_radix_blocks);
-  } else if (mem_ptr->op == BITXOR) {
-    update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
-                                lwe_array_2->degrees,
-                                lwe_array_1->num_radix_blocks);
-  }

  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_1, lwe_array_2,
-      bsks, ksks, lut, lwe_array_out->num_radix_blocks,
-      lut->params.message_modulus);
-
-  memcpy(lwe_array_out->degrees, degrees,
-         lwe_array_out->num_radix_blocks * sizeof(uint64_t));
+      bsks, ksks, num_radix_blocks, lut, lut->params.message_modulus);
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -25,16 +25,19 @@ void scratch_cuda_integer_radix_cmux_kb_64(

 void cuda_cmux_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_condition,
-    CudaRadixCiphertextFFI const *lwe_array_true,
-    CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
-    void *const *bsks, void *const *ksks) {
+    void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
+    void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t lwe_ciphertext_count) {

  host_integer_radix_cmux_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
-      lwe_condition, lwe_array_true, lwe_array_false,
-      (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_condition),
+      static_cast<const uint64_t *>(lwe_array_true),
+      static_cast<const uint64_t *>(lwe_array_false),
+      (int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+
+      lwe_ciphertext_count);
 }

 void cleanup_cuda_integer_radix_cmux(void *const *streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -2,7 +2,6 @@
 #define CUDA_INTEGER_CMUX_CUH

 #include "integer.cuh"
-#include "radix_ciphertext.cuh"

 template <typename Torus>
 __host__ void zero_out_if(cudaStream_t const *streams,
@@ -12,7 +11,7 @@ __host__ void zero_out_if(cudaStream_t const *streams,
                          int_zero_out_if_buffer<Torus> *mem_ptr,
                          int_radix_lut<Torus> *predicate, void *const *bsks,
                          Torus *const *ksks, uint32_t num_radix_blocks) {
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);
  auto params = mem_ptr->params;

  // We can't use integer_radix_apply_bivariate_lookup_table_kb since the
@@ -24,13 +23,13 @@ __host__ void zero_out_if(cudaStream_t const *streams,
      predicate->lwe_indexes_in, params.big_lwe_dimension,
      params.message_modulus, num_radix_blocks);

-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
      ksks, num_radix_blocks, predicate);
 }

 template <typename Torus>
-__host__ void legacy_host_integer_radix_cmux_kb(
+__host__ void host_integer_radix_cmux_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_condition,
    Torus const *lwe_array_true, Torus const *lwe_array_false,
@@ -40,88 +39,34 @@ __host__ void legacy_host_integer_radix_cmux_kb(
  auto params = mem_ptr->params;
  Torus lwe_size = params.big_lwe_dimension + 1;
  Torus radix_lwe_size = lwe_size * num_radix_blocks;
-  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in->ptr, lwe_array_true,
+  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in, lwe_array_true,
                               radix_lwe_size * sizeof(Torus), streams[0],
                               gpu_indexes[0]);
-  cuda_memcpy_async_gpu_to_gpu(
-      (Torus *)(mem_ptr->buffer_in->ptr) + radix_lwe_size, lwe_array_false,
-      radix_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
+  cuda_memcpy_async_gpu_to_gpu(mem_ptr->buffer_in + radix_lwe_size,
+                               lwe_array_false, radix_lwe_size * sizeof(Torus),
+                               streams[0], gpu_indexes[0]);
  for (uint i = 0; i < 2 * num_radix_blocks; i++) {
-    cuda_memcpy_async_gpu_to_gpu(
-        (Torus *)(mem_ptr->condition_array->ptr) + i * lwe_size, lwe_condition,
-        lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
-  }
-  legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, (Torus *)(mem_ptr->buffer_out->ptr),
-      (Torus *)(mem_ptr->buffer_in->ptr),
-      (Torus *)(mem_ptr->condition_array->ptr), bsks, ksks,
-      2 * num_radix_blocks, mem_ptr->predicate_lut, params.message_modulus);
-
-  // If the condition was true, true_ct will have kept its value and false_ct
-  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
-  // have kept its value
-  auto mem_true = (Torus *)(mem_ptr->buffer_out->ptr);
-  auto ptr = (Torus *)mem_ptr->buffer_out->ptr;
-  auto mem_false = &ptr[radix_lwe_size];
-  auto added_cts = mem_true;
-  legacy_host_addition<Torus>(streams[0], gpu_indexes[0], added_cts, mem_true,
-                              mem_false, params.big_lwe_dimension,
-                              num_radix_blocks);
-
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
-      num_radix_blocks, mem_ptr->message_extract_lut);
-}
-
-template <typename Torus>
-__host__ void host_integer_radix_cmux_kb(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_condition,
-    CudaRadixCiphertextFFI const *lwe_array_true,
-    CudaRadixCiphertextFFI const *lwe_array_false,
-    int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks) {
-
-  if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-  if (lwe_array_out->num_radix_blocks != lwe_array_false->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-
-  auto num_radix_blocks = lwe_array_out->num_radix_blocks;
-  auto params = mem_ptr->params;
-  Torus lwe_size = params.big_lwe_dimension + 1;
-  copy_radix_ciphertext_slice_async<Torus>(
-      streams[0], gpu_indexes[0], mem_ptr->buffer_in, 0, num_radix_blocks,
-      lwe_array_true, 0, num_radix_blocks);
-  copy_radix_ciphertext_slice_async<Torus>(
-      streams[0], gpu_indexes[0], mem_ptr->buffer_in, num_radix_blocks,
-      2 * num_radix_blocks, lwe_array_false, 0, num_radix_blocks);
-  for (uint i = 0; i < 2 * num_radix_blocks; i++) {
-    copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                             mem_ptr->condition_array, i, i + 1,
-                                             lwe_condition, 0, 1);
+    cuda_memcpy_async_gpu_to_gpu(mem_ptr->condition_array + i * lwe_size,
+                                 lwe_condition, lwe_size * sizeof(Torus),
+                                 streams[0], gpu_indexes[0]);
  }
  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, mem_ptr->buffer_out, mem_ptr->buffer_in,
-      mem_ptr->condition_array, bsks, ksks, mem_ptr->predicate_lut,
-      2 * num_radix_blocks, params.message_modulus);
+      mem_ptr->condition_array, bsks, ksks, 2 * num_radix_blocks,
+      mem_ptr->predicate_lut, params.message_modulus);

  // If the condition was true, true_ct will have kept its value and false_ct
  // will be 0 If the condition was false, true_ct will be 0 and false_ct will
  // have kept its value
-  CudaRadixCiphertextFFI mem_true;
-  CudaRadixCiphertextFFI mem_false;
-  as_radix_ciphertext_slice<Torus>(&mem_true, mem_ptr->buffer_out, 0,
-                                   num_radix_blocks);
-  as_radix_ciphertext_slice<Torus>(&mem_false, mem_ptr->buffer_out,
-                                   num_radix_blocks, 2 * num_radix_blocks);
-
-  host_addition<Torus>(streams[0], gpu_indexes[0], &mem_true, &mem_true,
-                       &mem_false, num_radix_blocks);
+  auto mem_true = mem_ptr->buffer_out;
+  auto mem_false = &mem_ptr->buffer_out[radix_lwe_size];
+  auto added_cts = mem_true;
+  host_addition<Torus>(streams[0], gpu_indexes[0], added_cts, mem_true,
+                       mem_false, params.big_lwe_dimension, num_radix_blocks);

  integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, &mem_true, bsks, ksks,
-      mem_ptr->message_extract_lut, num_radix_blocks);
+      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
+      num_radix_blocks, mem_ptr->message_extract_lut);
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -38,7 +38,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
                                    uint32_t lwe_dimension,
                                    uint32_t num_radix_blocks) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = (lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -122,9 +122,7 @@ __host__ void are_all_comparisons_block_true(
        };
        generate_device_accumulator<Torus>(
            streams[0], gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
-            is_max_value_lut->get_degree(1),
-            is_max_value_lut->get_max_degree(1), glwe_dimension,
-            polynomial_size, message_modulus, carry_modulus,
+            glwe_dimension, polynomial_size, message_modulus, carry_modulus,
            is_equal_to_num_blocks_lut_f);

        Torus *h_lut_indexes = (Torus *)malloc(num_chunks * sizeof(Torus));
@@ -148,12 +146,12 @@ __host__ void are_all_comparisons_block_true(
    // Applies the LUT
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
          ksks, 1, lut);
      return;
    } else {
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
          num_chunks, lut);
    }
@@ -219,12 +217,12 @@ __host__ void is_at_least_one_comparisons_block_true(
    // Applies the LUT
    if (remaining_blocks == 1) {
      // In the last iteration we copy the output to the final address
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
          ksks, 1, lut);
      return;
    } else {
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
          accumulator, bsks, ksks, num_chunks, lut);
    }
@@ -305,7 +303,7 @@ __host__ void host_compare_with_zero_equality(
    }
  }

-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
      zero_comparison);
  are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
@@ -324,7 +322,7 @@ __host__ void host_integer_radix_equality_check_kb(

  // Applies the LUT for the comparison operation
  auto comparisons = mem_ptr->tmp_block_comparisons;
-  legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
      bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
      eq_buffer->operator_lut->params.message_modulus);
@@ -371,7 +369,7 @@ __host__ void compare_radix_blocks_kb(

  // Apply LUT to compare to 0
  auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
      num_radix_blocks, is_non_zero_lut);

@@ -422,7 +420,7 @@ __host__ void tree_sign_reduction(
    pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
                       partial_block_count, 4);

-    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
        partial_block_count >> 1, inner_tree_leaf);

@@ -462,13 +460,12 @@ __host__ void tree_sign_reduction(
    f = sign_handler_f;
  }
  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
-      last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
+      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
      polynomial_size, message_modulus, carry_modulus, f);
  last_lut->broadcast_lut(streams, gpu_indexes, 0);

  // Last leaf
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
      last_lut);
 }
@@ -514,7 +511,7 @@ __host__ void host_integer_radix_difference_check_kb(

    // Clean noise
    auto identity_lut = mem_ptr->identity_lut;
-    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
        2 * packed_num_radix_blocks, identity_lut);

@@ -552,11 +549,11 @@ __host__ void host_integer_radix_difference_check_kb(
          packed_left + packed_num_radix_blocks * big_lwe_size;
      Torus *last_right_block_before_sign_block =
          packed_right + packed_num_radix_blocks * big_lwe_size;
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
          lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
          identity_lut);
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
          lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
          1, identity_lut);
@@ -566,7 +563,7 @@ __host__ void host_integer_radix_difference_check_kb(
          last_left_block_before_sign_block, last_right_block_before_sign_block,
          mem_ptr, bsks, ksks, 1);
      // Compare the sign block separately
-      legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -579,7 +576,7 @@ __host__ void host_integer_radix_difference_check_kb(
          streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
          lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
      // Compare the sign block separately
-      legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count,
          comparisons + (num_radix_blocks - 1) * big_lwe_size,
          lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -623,7 +620,7 @@ __host__ void host_integer_radix_maxmin_kb(
      ksks, total_num_radix_blocks);

  // Selector
-  legacy_host_integer_radix_cmux_kb<Torus>(
+  host_integer_radix_cmux_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out,
      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh
@@ -50,7 +50,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
  if (array_in == array_out)
    PANIC("Cuda error: Input and output must be different");

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  auto compression_params = mem_ptr->compression_params;

  auto log_modulus = mem_ptr->storage_log_modulus;
@@ -185,7 +185,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
  if (array_in == glwe_array_out)
    PANIC("Cuda error: Input and output must be different");

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);

  auto compression_params = mem_ptr->compression_params;

--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -285,7 +285,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      // Shift the mask so that we will only keep bits we should
      uint32_t shifted_mask = full_message_mask >> shift_amount;

-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
          interesting_divisor.last_block(), bsks, ksks, 1,
          mem_ptr->masking_luts_1[shifted_mask]);
@@ -314,7 +314,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
      // the estimated degree of the output is < msg_modulus
      shifted_mask = shifted_mask & full_message_mask;

-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
          divisor_ms_blocks.first_block(), bsks, ksks, 1,
          mem_ptr->masking_luts_2[shifted_mask]);
@@ -347,7 +347,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
                           interesting_remainder1.len - 1, streams[0],
                           gpu_indexes[0]);

-      legacy_host_radix_blocks_rotate_left<Torus>(
+      host_radix_blocks_rotate_left<Torus>(
          streams, gpu_indexes, gpu_count, interesting_remainder1.data,
          tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);

@@ -402,7 +402,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    // but in that position, interesting_remainder2 always has a 0
    auto &merged_interesting_remainder = interesting_remainder1;

-    legacy_host_addition<Torus>(
+    host_addition<Torus>(
        streams[0], gpu_indexes[0], merged_interesting_remainder.data,
        merged_interesting_remainder.data, interesting_remainder2.data,
        radix_params.big_lwe_dimension, merged_interesting_remainder.len);
@@ -481,7 +481,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto create_clean_version_of_merged_remainder =
        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
            uint32_t gpu_count) {
-          legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          integer_radix_apply_univariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data, bsks, ksks,
@@ -507,10 +507,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
      cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
    }

-    legacy_host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
-                                subtraction_overflowed.data,
-                                at_least_one_upper_block_is_non_zero.data,
-                                radix_params.big_lwe_dimension, 1);
+    host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
+                         subtraction_overflowed.data,
+                         at_least_one_upper_block_is_non_zero.data,
+                         radix_params.big_lwe_dimension, 1);

    int factor = (i) ? 3 : 2;
    int factor_lut_id = factor - 2;
@@ -521,7 +521,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto conditionally_zero_out_merged_interesting_remainder =
        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
            uint32_t gpu_count) {
-          legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data,
@@ -534,7 +534,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto conditionally_zero_out_merged_new_remainder =
        [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
            uint32_t gpu_count) {
-          legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+          integer_radix_apply_bivariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count, new_remainder.data,
              new_remainder.data, overflow_sum_radix.data, bsks, ksks,
              new_remainder.len,
@@ -544,7 +544,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
    auto set_quotient_bit = [&](cudaStream_t const *streams,
                                uint32_t const *gpu_indexes,
                                uint32_t gpu_count) {
-      legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+      integer_radix_apply_bivariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, did_not_overflow.data,
          subtraction_overflowed.data,
          at_least_one_upper_block_is_non_zero.data, bsks, ksks, 1,
@@ -552,7 +552,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
          mem_ptr->merge_overflow_flags_luts[pos_in_block]
              ->params.message_modulus);

-      legacy_host_addition<Torus>(
+      host_addition<Torus>(
          streams[0], gpu_indexes[0], &quotient[block_of_bit * big_lwe_size],
          &quotient[block_of_bit * big_lwe_size], did_not_overflow.data,
          radix_params.big_lwe_dimension, 1);
@@ -588,17 +588,17 @@ __host__ void host_unsigned_integer_div_rem_kb(

  // Clean the quotient and remainder
  // as even though they have no carries, they are not at nominal noise level
-  legacy_host_addition<Torus>(streams[0], gpu_indexes[0], remainder,
-                              remainder1.data, remainder2.data,
-                              radix_params.big_lwe_dimension, remainder1.len);
+  host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1.data,
+                       remainder2.data, radix_params.big_lwe_dimension,
+                       remainder1.len);

  for (uint j = 0; j < gpu_count; j++) {
    cuda_synchronize_stream(streams[j], gpu_indexes[j]);
  }
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
      bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
      ksks, num_blocks, mem_ptr->message_extract_lut_2);
  for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
@@ -636,14 +636,12 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
    }

-    legacy_host_integer_abs_kb_async<Torus>(
-        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
-        positive_numerator.data, bsks, ksks, int_mem_ptr->abs_mem_1, true,
-        num_blocks);
-    legacy_host_integer_abs_kb_async<Torus>(
-        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
-        positive_divisor.data, bsks, ksks, int_mem_ptr->abs_mem_2, true,
-        num_blocks);
+    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, gpu_indexes,
+                               gpu_count, positive_numerator.data, bsks, ksks,
+                               int_mem_ptr->abs_mem_1, true, num_blocks);
+    host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, gpu_indexes,
+                               gpu_count, positive_divisor.data, bsks, ksks,
+                               int_mem_ptr->abs_mem_2, true, num_blocks);
    for (uint j = 0; j < int_mem_ptr->active_gpu_count; j++) {
      cuda_synchronize_stream(int_mem_ptr->sub_streams_1[j], gpu_indexes[j]);
      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -654,7 +652,7 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
        positive_numerator.data, positive_divisor.data, bsks, ksks,
        int_mem_ptr->unsigned_mem, num_blocks);

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
        int_mem_ptr->sign_bits_are_different,
        &numerator[big_lwe_size * (num_blocks - 1)],
@@ -667,36 +665,36 @@ __host__ void host_integer_div_rem_kb(cudaStream_t const *streams,
      cuda_synchronize_stream(int_mem_ptr->sub_streams_2[j], gpu_indexes[j]);
    }

-    legacy_host_integer_radix_negation(
+    host_integer_radix_negation(
        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
        int_mem_ptr->negated_quotient, quotient, radix_params.big_lwe_dimension,
        num_blocks, radix_params.message_modulus, radix_params.carry_modulus);

    uint32_t requested_flag = outputFlag::FLAG_NONE;
    uint32_t uses_carry = 0;
-    legacy_host_propagate_single_carry<Torus>(
+    host_propagate_single_carry<Torus>(
        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
        int_mem_ptr->negated_quotient, nullptr, nullptr, int_mem_ptr->scp_mem_1,
        bsks, ksks, num_blocks, requested_flag, uses_carry);

-    legacy_host_integer_radix_negation(
-        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
-        int_mem_ptr->negated_remainder, remainder,
-        radix_params.big_lwe_dimension, num_blocks,
-        radix_params.message_modulus, radix_params.carry_modulus);
+    host_integer_radix_negation(int_mem_ptr->sub_streams_2, gpu_indexes,
+                                gpu_count, int_mem_ptr->negated_remainder,
+                                remainder, radix_params.big_lwe_dimension,
+                                num_blocks, radix_params.message_modulus,
+                                radix_params.carry_modulus);

-    legacy_host_propagate_single_carry<Torus>(
+    host_propagate_single_carry<Torus>(
        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
        int_mem_ptr->negated_remainder, nullptr, nullptr,
        int_mem_ptr->scp_mem_2, bsks, ksks, num_blocks, requested_flag,
        uses_carry);

-    legacy_host_integer_radix_cmux_kb<Torus>(
+    host_integer_radix_cmux_kb<Torus>(
        int_mem_ptr->sub_streams_1, gpu_indexes, gpu_count, quotient,
        int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
        quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks, num_blocks);

-    legacy_host_integer_radix_cmux_kb<Torus>(
+    host_integer_radix_cmux_kb<Torus>(
        int_mem_ptr->sub_streams_2, gpu_indexes, gpu_count, remainder,
        &numerator[big_lwe_size * (num_blocks - 1)],
        int_mem_ptr->negated_remainder, remainder,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -106,27 +106,32 @@ void scratch_cuda_integer_overflowing_sub_kb_64_inplace(

 void cuda_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI *carry_out,
-    const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
+    void *lwe_array, void *carry_out, const void *carry_in, int8_t *mem_ptr,
+    void *const *bsks, void *const *ksks, uint32_t num_blocks,
+    uint32_t requested_flag, uint32_t uses_carry) {

  host_propagate_single_carry<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, carry_out,
-      carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), requested_flag, uses_carry);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(carry_out),
+      static_cast<const uint64_t *>(carry_in),
+      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks, requested_flag, uses_carry);
 }

 void cuda_add_and_propagate_single_carry_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
-    CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks,
-    uint32_t requested_flag, uint32_t uses_carry) {
+    void *lhs_array, const void *rhs_array, void *carry_out,
+    const void *carry_in, int8_t *mem_ptr, void *const *bsks, void *const *ksks,
+    uint32_t num_blocks, uint32_t requested_flag, uint32_t uses_carry) {

  host_add_and_propagate_single_carry<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
-      carry_out, carry_in, (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), requested_flag, uses_carry);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lhs_array),
+      static_cast<const uint64_t *>(rhs_array),
+      static_cast<uint64_t *>(carry_out),
+      static_cast<const uint64_t *>(carry_in),
+      (int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks, requested_flag, uses_carry);
 }

 void cuda_integer_overflowing_sub_kb_64_inplace(
@@ -179,7 +184,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -190,7 +195,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      lut_degree, allocate_gpu_memory);
+      allocate_gpu_memory);
 }

 void scratch_cuda_apply_many_univariate_lut_kb_64(
@@ -200,7 +205,7 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory) {
+    uint32_t num_many_lut, bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -211,19 +216,22 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      num_many_lut, lut_degree, allocate_gpu_memory);
+      num_many_lut, allocate_gpu_memory);
 }

-void cuda_apply_univariate_lut_kb_64(
-    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks) {
+void cuda_apply_univariate_lut_kb_64(void *const *streams,
+                                     uint32_t const *gpu_indexes,
+                                     uint32_t gpu_count, void *output_radix_lwe,
+                                     void const *input_radix_lwe,
+                                     int8_t *mem_ptr, void *const *ksks,
+                                     void *const *bsks, uint32_t num_blocks) {

  host_apply_univariate_lut_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
-      input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
-      bsks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<const uint64_t *>(input_radix_lwe),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
+      num_blocks);
 }

 void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
@@ -236,15 +244,16 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,

 void cuda_apply_many_univariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_many_lut,
-    uint32_t lut_stride) {
+    void *output_radix_lwe, void const *input_radix_lwe, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks,
+    uint32_t num_many_lut, uint32_t lut_stride) {

  host_apply_many_univariate_lut_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
-      input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
-      bsks, num_many_lut, lut_stride);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<const uint64_t *>(input_radix_lwe),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
+      num_many_lut, lut_stride);
 }

 void scratch_cuda_apply_bivariate_lut_kb_64(
@@ -254,7 +263,7 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -265,21 +274,22 @@ void scratch_cuda_apply_bivariate_lut_kb_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      lut_degree, allocate_gpu_memory);
+      allocate_gpu_memory);
 }

 void cuda_apply_bivariate_lut_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI const *input_radix_lwe_1,
-    CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
-    uint32_t shift) {
+    void *output_radix_lwe, void const *input_radix_lwe_1,
+    void const *input_radix_lwe_2, int8_t *mem_ptr, void *const *ksks,
+    void *const *bsks, uint32_t num_blocks, uint32_t shift) {

  host_apply_bivariate_lut_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
-      input_radix_lwe_1, input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
-      (uint64_t **)(ksks), bsks, num_radix_blocks, shift);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<const uint64_t *>(input_radix_lwe_1),
+      static_cast<const uint64_t *>(input_radix_lwe_2),
+      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
+      shift);
 }

 void cleanup_cuda_apply_bivariate_lut_kb_64(void *const *streams,
@@ -297,7 +307,7 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t num_radix_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint64_t lut_degree, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {

  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
@@ -308,21 +318,22 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      (int_radix_lut<uint64_t> **)mem_ptr,
      static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
-      lut_degree, allocate_gpu_memory);
+      allocate_gpu_memory);
 }

 void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *output_radix_lwe,
-    CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
-    void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) {
+    void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks, uint32_t num_blocks, uint32_t shift) {

  int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;

  host_compute_prefix_sum_hillis_steele<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
-      generates_or_propagates, params, (int_radix_lut<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks), num_radix_blocks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(output_radix_lwe),
+      static_cast<uint64_t *>(generates_or_propagates), params,
+      (int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      num_blocks);
 }

 void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
@@ -334,9 +345,11 @@ void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(

 void cuda_integer_reverse_blocks_64_inplace(void *const *streams,
                                            uint32_t const *gpu_indexes,
-                                            uint32_t gpu_count,
-                                            CudaRadixCiphertextFFI *lwe_array) {
+                                            uint32_t gpu_count, void *lwe_array,
+                                            uint32_t num_blocks,
+                                            uint32_t lwe_size) {

-  host_radix_blocks_reverse_inplace<uint64_t>((cudaStream_t *)(streams),
-                                              gpu_indexes, lwe_array);
+  host_radix_blocks_reverse_inplace<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes,
+      static_cast<uint64_t *>(lwe_array), num_blocks, lwe_size);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -228,9 +228,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
                                 streams[0], gpu_indexes[0]);
  }
  if (num_radix_in_vec == 2) {
-    legacy_host_addition<Torus>(
-        streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-        &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks);
+    host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                         &old_blocks[num_blocks * big_lwe_size],
+                         big_lwe_dimension, num_blocks);
    return;
  }

@@ -280,13 +280,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(

  // generate accumulators
  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], message_acc,
-      luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
-      glwe_dimension, polynomial_size, message_modulus, carry_modulus,
-      lut_f_message);
+      streams[0], gpu_indexes[0], message_acc, glwe_dimension, polynomial_size,
+      message_modulus, carry_modulus, lut_f_message);
  generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
-      luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
+      streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
      message_modulus, carry_modulus, lut_f_carry);
  luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);

@@ -297,7 +294,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
      ch_amount++;
    dim3 add_grid(ch_amount, num_blocks, 1);

-    cuda_set_device(gpu_indexes[0]);
+    cudaSetDevice(gpu_indexes[0]);
    tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
        new_blocks, old_blocks, min(r, chunk_size), big_lwe_size, num_blocks);

@@ -448,9 +445,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
  luts_message_carry->release(streams, gpu_indexes, gpu_count);
  delete (luts_message_carry);

-  legacy_host_addition<Torus>(
-      streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
-      &old_blocks[num_blocks * big_lwe_size], big_lwe_dimension, num_blocks);
+  host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
+                       &old_blocks[num_blocks * big_lwe_size],
+                       big_lwe_dimension, num_blocks);
 }

 template <typename Torus, class params>
@@ -541,13 +538,13 @@ __host__ void host_integer_mult_radix_kb(
  dim3 grid(lsb_vector_block_count, 1, 1);
  dim3 thds(params::degree / params::opt, 1, 1);

-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);
  all_shifted_lhs_rhs<Torus, params><<<grid, thds, 0, streams[0]>>>(
      radix_lwe_left, vector_result_lsb, vector_result_msb, radix_lwe_right,
      vector_lsb_rhs, vector_msb_rhs, num_blocks);
  check_cuda_error(cudaGetLastError());

-  legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+  integer_radix_apply_bivariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, block_mul_res, block_mul_res,
      vector_result_sb, bsks, ksks, total_block_count, luts_array,
      luts_array->params.message_modulus);
@@ -556,7 +553,7 @@ __host__ void host_integer_mult_radix_kb(
  vector_result_msb = &block_mul_res[lsb_vector_block_count *
                                     (polynomial_size * glwe_dimension + 1)];

-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);
  fill_radix_from_lsb_msb<Torus, params>
      <<<num_blocks * num_blocks, params::degree / params::opt, 0,
         streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
@@ -587,7 +584,7 @@ __host__ void host_integer_mult_radix_kb(
  auto scp_mem_ptr = mem_ptr->sc_prop_mem;
  uint32_t requested_flag = outputFlag::FLAG_NONE;
  uint32_t uses_carry = 0;
-  legacy_host_propagate_single_carry<Torus>(
+  host_propagate_single_carry<Torus>(
      streams, gpu_indexes, gpu_count, radix_lwe_out, nullptr, nullptr,
      scp_mem_ptr, bsks, ksks, num_blocks, requested_flag, uses_carry);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cu
@@ -2,11 +2,13 @@

 void cuda_negate_integer_radix_ciphertext_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_in, uint32_t message_modulus,
+    void *lwe_array_out, void const *lwe_array_in, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus) {

-  host_integer_radix_negation<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
-                                        gpu_count, lwe_array_out, lwe_array_in,
-                                        message_modulus, carry_modulus);
+  host_integer_radix_negation<uint64_t>(
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_in), lwe_dimension,
+      lwe_ciphertext_count, message_modulus, carry_modulus);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/negation.cuh
@@ -54,69 +54,12 @@ device_integer_radix_negation(Torus *output, Torus const *input,
 }

 template <typename Torus>
-__host__ void
-host_integer_radix_negation(cudaStream_t const *streams,
-                            uint32_t const *gpu_indexes, uint32_t gpu_count,
-                            CudaRadixCiphertextFFI *lwe_array_out,
-                            CudaRadixCiphertextFFI const *lwe_array_in,
-                            uint64_t message_modulus, uint64_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
-
-  if (lwe_array_out->num_radix_blocks != lwe_array_in->num_radix_blocks)
-    PANIC("Cuda error: lwe_array_in and lwe_array_out num radix blocks must be "
-          "the same")
-
-  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
-    PANIC("Cuda error: lwe_array_in and lwe_array_out lwe_dimension must be "
-          "the same")
-
-  auto num_radix_blocks = lwe_array_out->num_radix_blocks;
-  auto lwe_dimension = lwe_array_out->lwe_dimension;
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_integer_radix_negation<Torus><<<grid, thds, 0, streams[0]>>>(
-      static_cast<Torus *>(lwe_array_out->ptr),
-      static_cast<Torus *>(lwe_array_in->ptr), num_radix_blocks, lwe_dimension,
-      message_modulus, delta);
-  check_cuda_error(cudaGetLastError());
-
-  uint8_t zb = 0;
-  for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
-    auto input_degree = lwe_array_in->degrees[i];
-
-    if (zb != 0) {
-      input_degree += static_cast<uint64_t>(zb);
-    }
-    Torus z =
-        std::max(static_cast<Torus>(1),
-                 static_cast<Torus>(ceil(input_degree / message_modulus))) *
-        message_modulus;
-
-    lwe_array_out->degrees[i] = z - static_cast<uint64_t>(zb);
-    lwe_array_out->noise_levels[i] = lwe_array_in->noise_levels[i];
-    zb = z / message_modulus;
-  }
-}
-template <typename Torus>
-__host__ void legacy_host_integer_radix_negation(
+__host__ void host_integer_radix_negation(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *output, Torus const *input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint64_t message_modulus, uint64_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);

  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cu
@@ -1,10 +0,0 @@
-#include "radix_ciphertext.cuh"
-
-void release_radix_ciphertext(cudaStream_t const stream,
-                              uint32_t const gpu_index,
-                              CudaRadixCiphertextFFI *data) {
-  cuda_drop_async(data->ptr, stream, gpu_index);
-  free(data->degrees);
-  free(data->noise_levels);
-  cuda_synchronize_stream(stream, gpu_index);
-}
--- a/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/radix_ciphertext.cuh
@@ -1,142 +0,0 @@
-#ifndef CUDA_INTEGER_RADIX_CIPHERTEXT_CUH
-#define CUDA_INTEGER_RADIX_CIPHERTEXT_CUH
-
-#include "device.h"
-#include "integer/integer.h"
-
-template <typename Torus>
-void create_zero_radix_ciphertext_async(cudaStream_t const stream,
-                                        uint32_t const gpu_index,
-                                        CudaRadixCiphertextFFI *radix,
-                                        const uint32_t num_radix_blocks,
-                                        const uint32_t lwe_dimension) {
-  radix->lwe_dimension = lwe_dimension;
-  radix->num_radix_blocks = num_radix_blocks;
-  uint32_t size = (lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
-  radix->ptr = (void *)cuda_malloc_async(size, stream, gpu_index);
-  cuda_memset_async(radix->ptr, 0, size, stream, gpu_index);
-
-  radix->degrees = (uint64_t *)(calloc(num_radix_blocks, sizeof(uint64_t)));
-  radix->noise_levels =
-      (uint64_t *)(calloc(num_radix_blocks, sizeof(uint64_t)));
-  if (radix->degrees == NULL || radix->noise_levels == NULL) {
-    PANIC("Cuda error: degrees / noise levels not allocated correctly")
-  }
-}
-
-// end_input_lwe_index is exclusive
-template <typename Torus>
-void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix,
-                               const CudaRadixCiphertextFFI *input_radix,
-                               const uint32_t start_input_lwe_index,
-                               const uint32_t end_input_lwe_index) {
-  if (input_radix->num_radix_blocks <
-      end_input_lwe_index - start_input_lwe_index)
-    PANIC("Cuda error: input radix should have more blocks than the specified "
-          "range")
-  if (start_input_lwe_index >= end_input_lwe_index)
-    PANIC("Cuda error: slice range should be non negative")
-
-  auto lwe_size = input_radix->lwe_dimension + 1;
-  output_radix->num_radix_blocks = end_input_lwe_index - start_input_lwe_index;
-  output_radix->lwe_dimension = input_radix->lwe_dimension;
-  Torus *in_ptr = (Torus *)input_radix->ptr;
-  output_radix->ptr = (void *)(in_ptr + start_input_lwe_index * lwe_size);
-  output_radix->degrees = input_radix->degrees + start_input_lwe_index;
-  output_radix->noise_levels =
-      input_radix->noise_levels + start_input_lwe_index;
-}
-
-// end_lwe_index are exclusive
-template <typename Torus>
-void copy_radix_ciphertext_slice_async(
-    cudaStream_t const stream, uint32_t const gpu_index,
-    CudaRadixCiphertextFFI *output_radix, const uint32_t output_start_lwe_index,
-    const uint32_t output_end_lwe_index,
-    const CudaRadixCiphertextFFI *input_radix,
-    const uint32_t input_start_lwe_index, const uint32_t input_end_lwe_index) {
-  if (output_radix->lwe_dimension != input_radix->lwe_dimension)
-    PANIC("Cuda error: input lwe dimension should be equal to output lwe "
-          "dimension")
-  if (output_end_lwe_index - output_start_lwe_index !=
-      input_end_lwe_index - input_start_lwe_index)
-    PANIC("Cuda error: output and input ranges should have the same size")
-  if (output_end_lwe_index - output_start_lwe_index >
-      output_radix->num_radix_blocks)
-    PANIC("Cuda error: output range should be lower or equal to output num "
-          "blocks")
-  if (input_end_lwe_index - input_start_lwe_index >
-      input_radix->num_radix_blocks)
-    PANIC(
-        "Cuda error: input range should be lower or equal to input num blocks")
-  if (output_end_lwe_index - output_start_lwe_index <= 0)
-    PANIC("Cuda error: output range should be greater than zero")
-  if (input_end_lwe_index - input_start_lwe_index <= 0)
-    PANIC("Cuda error: input range should be greater than zero")
-  if (output_end_lwe_index <= output_start_lwe_index)
-    PANIC("Cuda error: output end index should be greater or equal to output "
-          "start index")
-  if (input_end_lwe_index <= input_start_lwe_index)
-    PANIC("Cuda error: input end index should be greater or equal to input "
-          "start index")
-  if (output_start_lwe_index > output_radix->num_radix_blocks)
-    PANIC("Cuda error: output start index should be smaller than the number "
-          "of blocks")
-  if (input_start_lwe_index > input_radix->num_radix_blocks)
-    PANIC("Cuda error: input start index should be smaller than the number "
-          "of blocks")
-
-  auto lwe_size = input_radix->lwe_dimension + 1;
-  Torus *out_ptr = (Torus *)output_radix->ptr;
-  out_ptr = &out_ptr[output_start_lwe_index * lwe_size];
-  Torus *in_ptr = (Torus *)input_radix->ptr;
-  in_ptr = &in_ptr[input_start_lwe_index * lwe_size];
-  auto num_blocks = input_end_lwe_index - input_start_lwe_index;
-
-  cuda_memcpy_async_gpu_to_gpu(out_ptr, in_ptr,
-                               num_blocks * lwe_size * sizeof(Torus), stream,
-                               gpu_index);
-  for (uint i = 0; i < num_blocks; i++) {
-    output_radix->degrees[i + output_start_lwe_index] =
-        input_radix->degrees[i + input_start_lwe_index];
-    output_radix->noise_levels[i + output_start_lwe_index] =
-        input_radix->noise_levels[i + input_start_lwe_index];
-  }
-}
-
-template <typename Torus>
-void copy_radix_ciphertext_async(cudaStream_t const stream,
-                                 uint32_t const gpu_index,
-                                 CudaRadixCiphertextFFI *output_radix,
-                                 const CudaRadixCiphertextFFI *input_radix) {
-  copy_radix_ciphertext_slice_async<Torus>(
-      stream, gpu_index, output_radix, 0, output_radix->num_radix_blocks,
-      input_radix, 0, input_radix->num_radix_blocks);
-}
-
-// end_lwe_index is exclusive
-template <typename Torus>
-void set_zero_radix_ciphertext_slice_async(cudaStream_t const stream,
-                                           uint32_t const gpu_index,
-                                           CudaRadixCiphertextFFI *radix,
-                                           const uint32_t start_lwe_index,
-                                           const uint32_t end_lwe_index) {
-  if (radix->num_radix_blocks < end_lwe_index - start_lwe_index)
-    PANIC("Cuda error: input radix should have more blocks than the specified "
-          "range")
-  if (start_lwe_index >= end_lwe_index)
-    PANIC("Cuda error: slice range should be non negative")
-
-  auto lwe_size = radix->lwe_dimension + 1;
-  auto num_blocks_to_set = end_lwe_index - start_lwe_index;
-  auto lwe_array_out_block = (Torus *)radix->ptr + start_lwe_index * lwe_size;
-  cuda_memset_async(lwe_array_out_block, 0,
-                    num_blocks_to_set * lwe_size * sizeof(Torus), stream,
-                    gpu_index);
-  memset(&radix->degrees[start_lwe_index], 0,
-         num_blocks_to_set * sizeof(uint64_t));
-  memset(&radix->noise_levels[start_lwe_index], 0,
-         num_blocks_to_set * sizeof(uint64_t));
-}
-
-#endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
@@ -2,11 +2,13 @@

 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
-    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
+    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
+    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
+    uint32_t carry_modulus) {

  host_integer_radix_scalar_addition_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
-      static_cast<const uint64_t *>(scalar_input), num_scalars, message_modulus,
-      carry_modulus);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array),
+      static_cast<const uint64_t *>(scalar_input), lwe_dimension,
+      lwe_ciphertext_count, message_modulus, carry_modulus);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -24,12 +24,12 @@ __global__ void device_integer_radix_scalar_addition_inplace(
 }

 template <typename Torus>
-__host__ void legacy_host_integer_radix_scalar_addition_inplace(
+__host__ void host_integer_radix_scalar_addition_inplace(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
@@ -49,42 +49,6 @@ __host__ void legacy_host_integer_radix_scalar_addition_inplace(
                                      delta);
  check_cuda_error(cudaGetLastError());
 }
-template <typename Torus>
-__host__ void host_integer_radix_scalar_addition_inplace(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
-    Torus const *scalar_input, uint32_t num_scalars, uint32_t message_modulus,
-    uint32_t carry_modulus) {
-  if (lwe_array->num_radix_blocks < num_scalars)
-    PANIC("Cuda error: num scalars should be smaller or equal to input num "
-          "radix blocks")
-  cuda_set_device(gpu_indexes[0]);
-
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = num_scalars;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  // Value of the shift we multiply our messages by
-  // If message_modulus and carry_modulus are always powers of 2 we can simplify
-  // this
-  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
-
-  device_integer_radix_scalar_addition_inplace<Torus>
-      <<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr, scalar_input,
-                                      num_scalars, lwe_array->lwe_dimension,
-                                      delta);
-  check_cuda_error(cudaGetLastError());
-  Torus scalar_input_cpu[num_scalars];
-  cuda_memcpy_async_to_cpu(&scalar_input_cpu, scalar_input,
-                           num_scalars * sizeof(Torus), streams[0],
-                           gpu_indexes[0]);
-  for (uint i = 0; i < num_scalars; i++) {
-    lwe_array->degrees[i] = lwe_array->degrees[i] + scalar_input_cpu[i];
-  }
-}

 template <typename Torus>
 __global__ void device_integer_radix_add_scalar_one_inplace(
@@ -104,7 +68,7 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
    uint32_t gpu_count, Torus *lwe_array, uint32_t lwe_dimension,
    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
@@ -144,7 +108,7 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
    uint32_t gpu_count, Torus *lwe_array, Torus *scalar_input,
    uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
    uint32_t message_modulus, uint32_t carry_modulus) {
-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);

  // Create a 1-dimensional grid of threads
  int num_blocks = 0, num_threads = 0;
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cu
@@ -2,58 +2,15 @@

 void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array_out,
-    CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
+    void *lwe_array_out, void const *lwe_array_input, void const *clear_blocks,
    uint32_t num_clear_blocks, int8_t *mem_ptr, void *const *bsks,
-    void *const *ksks) {
+    void *const *ksks, uint32_t lwe_ciphertext_count, BITOP_TYPE op) {

  host_integer_radix_scalar_bitop_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
-      lwe_array_input, static_cast<const uint64_t *>(clear_blocks),
-      num_clear_blocks, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
-}
-
-void update_degrees_after_scalar_bitand(uint64_t *output_degrees,
-                                        uint64_t *clear_degrees,
-                                        uint64_t *input_degrees,
-                                        uint32_t num_clear_blocks) {
-  for (uint i = 0; i < num_clear_blocks; i++) {
-    output_degrees[i] = std::min(clear_degrees[i], input_degrees[i]);
-  }
-}
-void update_degrees_after_scalar_bitor(uint64_t *output_degrees,
-                                       uint64_t *clear_degrees,
-                                       uint64_t *input_degrees,
-                                       uint32_t num_clear_blocks) {
-  for (uint i = 0; i < num_clear_blocks; i++) {
-    auto max = std::max(clear_degrees[i], input_degrees[i]);
-    auto min = std::min(clear_degrees[i], input_degrees[i]);
-    auto result = max;
-
-    for (uint j = 0; j < min + 1; j++) {
-      if (max | j > result) {
-        result = max | j;
-      }
-    }
-    output_degrees[i] = result;
-  }
-}
-void update_degrees_after_scalar_bitxor(uint64_t *output_degrees,
-                                        uint64_t *clear_degrees,
-                                        uint64_t *input_degrees,
-                                        uint32_t num_clear_blocks) {
-  for (uint i = 0; i < num_clear_blocks; i++) {
-    auto max = std::max(clear_degrees[i], input_degrees[i]);
-    auto min = std::min(clear_degrees[i], input_degrees[i]);
-    auto result = max;
-
-    // Try every possibility to find the worst case
-    for (uint j = 0; j < min + 1; j++) {
-      if (max ^ j > result) {
-        result = max ^ j;
-      }
-    }
-    output_degrees[i] = result;
-  }
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array_out),
+      static_cast<const uint64_t *>(lwe_array_input),
+      static_cast<const uint64_t *>(clear_blocks), num_clear_blocks,
+      (int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
+      lwe_ciphertext_count, op);
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -7,60 +7,45 @@
 template <typename Torus>
 __host__ void host_integer_radix_scalar_bitop_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
-    CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
-    uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr,
-    void *const *bsks, Torus *const *ksks) {
+    uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_input,
+    Torus const *clear_blocks, uint32_t num_clear_blocks,
+    int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
+    uint32_t num_radix_blocks, BITOP_TYPE op) {

-  if (output->num_radix_blocks != input->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be equal")
-  if (output->lwe_dimension != input->lwe_dimension)
-    PANIC("Cuda error: input and output num radix blocks must be equal")
  auto lut = mem_ptr->lut;
-  auto op = mem_ptr->op;
-  auto num_radix_blocks = output->num_radix_blocks;
+  auto params = lut->params;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+
+  uint32_t lwe_size = big_lwe_dimension + 1;

  if (num_clear_blocks == 0) {
    if (op == SCALAR_BITAND) {
-      set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                                   output, 0, num_radix_blocks);
+      cuda_memset_async(lwe_array_out, 0,
+                        num_radix_blocks * lwe_size * sizeof(Torus), streams[0],
+                        gpu_indexes[0]);
    } else {
-      if (input != output)
-        copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
-                                           input);
+      cuda_memcpy_async_gpu_to_gpu(lwe_array_out, lwe_array_input,
+                                   num_radix_blocks * lwe_size * sizeof(Torus),
+                                   streams[0], gpu_indexes[0]);
    }
  } else {
    // We have all possible LUTs pre-computed and we use the decomposed scalar
    // as index to recover the right one
-    uint64_t degrees[num_clear_blocks];
-    uint64_t clear_degrees[num_clear_blocks];
-    cuda_memcpy_async_to_cpu(&clear_degrees, clear_blocks,
-                             num_clear_blocks * sizeof(Torus), streams[0],
-                             gpu_indexes[0]);
-    if (mem_ptr->op == BITOP_TYPE::SCALAR_BITAND) {
-      update_degrees_after_scalar_bitand(degrees, clear_degrees, input->degrees,
-                                         num_clear_blocks);
-    } else if (mem_ptr->op == BITOP_TYPE::SCALAR_BITOR) {
-      update_degrees_after_scalar_bitor(degrees, clear_degrees, input->degrees,
-                                        num_clear_blocks);
-    } else if (mem_ptr->op == SCALAR_BITXOR) {
-      update_degrees_after_scalar_bitxor(degrees, clear_degrees, input->degrees,
-                                         num_clear_blocks);
-    }
    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
                                 num_clear_blocks * sizeof(Torus), streams[0],
                                 gpu_indexes[0]);
    lut->broadcast_lut(streams, gpu_indexes, 0);

    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, output, input, bsks, ksks, lut,
-        num_clear_blocks);
-    memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));
+        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
+        ksks, num_clear_blocks, lut);

    if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
-      set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                                   output, num_clear_blocks,
-                                                   num_radix_blocks);
+      auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
+      cuda_memset_async(lwe_array_out_block, 0,
+                        (num_radix_blocks - num_clear_blocks) * lwe_size *
+                            sizeof(Torus),
+                        streams[0], gpu_indexes[0]);
    }
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -43,7 +43,7 @@ __host__ void scalar_compare_radix_blocks_kb(

  // Apply LUT to compare to 0
  auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
      ksks, num_radix_blocks, sign_lut);

@@ -110,13 +110,13 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
    };

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-    generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
-        lut->get_max_degree(0), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, scalar_last_leaf_lut_f);
+    generate_device_accumulator<Torus>(streams[0], gpu_indexes[0],
+                                       lut->get_lut(0, 0), glwe_dimension,
+                                       polynomial_size, message_modulus,
+                                       carry_modulus, scalar_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, 0);

-    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out,
        mem_ptr->tmp_lwe_array_out, bsks, ksks, 1, lut);

@@ -195,12 +195,12 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(

    auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
-        lut->get_max_degree(0), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, 0);

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
        lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);

@@ -331,12 +331,12 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(

    auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
    generate_device_accumulator_bivariate<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
-        lut->get_max_degree(0), glwe_dimension, polynomial_size,
-        message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
+        polynomial_size, message_modulus, carry_modulus,
+        scalar_bivariate_last_leaf_lut_f);
    lut->broadcast_lut(streams, gpu_indexes, 0);

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
        sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);

@@ -426,13 +426,12 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
    auto signed_msb_lut = mem_ptr->signed_msb_lut;
    generate_device_accumulator_bivariate<Torus>(
        msb_streams[0], gpu_indexes[0], signed_msb_lut->get_lut(0, 0),
-        signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, lut_f);
    signed_msb_lut->broadcast_lut(streams, gpu_indexes, 0);

    Torus const *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
        are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
        signed_msb_lut->params.message_modulus);
@@ -492,7 +491,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
        msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
        big_lwe_dimension, 1, 1, message_modulus, carry_modulus);

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
        encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
        mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
@@ -541,10 +540,10 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(

  // Selector
  // CMUX for Max or Min
-  legacy_host_integer_radix_cmux_kb<Torus>(
-      streams, gpu_indexes, gpu_count, lwe_array_out, sign, lwe_array_left,
-      lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
-      total_num_radix_blocks);
+  host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
+                                    lwe_array_out, sign, lwe_array_left,
+                                    lwe_array_right, mem_ptr->cmux_buffer, bsks,
+                                    ksks, total_num_radix_blocks);
 }

 template <typename Torus>
@@ -622,7 +621,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(

  // Selector
  // CMUX for Max or Min
-  legacy_host_integer_radix_cmux_kb<Torus>(
+  host_integer_radix_cmux_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out,
      mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
      mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
@@ -686,7 +685,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
                                 lsb_streams[0], gpu_indexes[0]);
    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);

-    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
        bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
  }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -47,6 +47,9 @@ __host__ void host_integer_scalar_mul_radix(
    void *const *bsks, T *const *ksks, uint32_t input_lwe_dimension,
    uint32_t message_modulus, uint32_t num_radix_blocks, uint32_t num_scalars) {

+  if (num_radix_blocks == 0 | num_scalars == 0)
+    return;
+
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  uint32_t lwe_size = input_lwe_dimension + 1;
@@ -80,7 +83,7 @@ __host__ void host_integer_scalar_mul_radix(
          preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
      T *block_shift_buffer =
          all_shifted_buffer + j * num_radix_blocks * lwe_size;
-      legacy_host_radix_blocks_rotate_right<T>(
+      host_radix_blocks_rotate_right<T>(
          streams, gpu_indexes, gpu_count, block_shift_buffer,
          preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size);
      // create trivial assign for value = 0
@@ -114,7 +117,7 @@ __host__ void host_integer_scalar_mul_radix(
    auto scp_mem_ptr = mem->sc_prop_mem;
    uint32_t requested_flag = outputFlag::FLAG_NONE;
    uint32_t uses_carry = 0;
-    legacy_host_propagate_single_carry<T>(
+    host_propagate_single_carry<T>(
        streams, gpu_indexes, gpu_count, lwe_array, nullptr, nullptr,
        scp_mem_ptr, bsks, ksks, num_radix_blocks, requested_flag, uses_carry);
  }
@@ -122,12 +125,12 @@ __host__ void host_integer_scalar_mul_radix(

 // Small scalar_mul is used in shift/rotate
 template <typename T>
-__host__ void host_legacy_integer_small_scalar_mul_radix(
+__host__ void host_integer_small_scalar_mul_radix(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, T *output_lwe_array, T *input_lwe_array, T scalar,
    uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {

-  cuda_set_device(gpu_indexes[0]);
+  cudaSetDevice(gpu_indexes[0]);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -143,42 +146,4 @@ __host__ void host_legacy_integer_small_scalar_mul_radix(
      input_lwe_ciphertext_count);
  check_cuda_error(cudaGetLastError());
 }
-
-// Small scalar_mul is used in shift/rotate
-template <typename T>
-__host__ void host_integer_small_scalar_mul_radix(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *output_lwe_array,
-    CudaRadixCiphertextFFI *input_lwe_array, T scalar) {
-
-  if (output_lwe_array->num_radix_blocks != input_lwe_array->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-  if (output_lwe_array->lwe_dimension != input_lwe_array->lwe_dimension)
-    PANIC("Cuda error: input and output lwe_dimension must be the same")
-
-  cuda_set_device(gpu_indexes[0]);
-  auto lwe_dimension = input_lwe_array->lwe_dimension;
-  auto num_radix_blocks = input_lwe_array->num_radix_blocks;
-
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = num_radix_blocks * lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  device_small_scalar_radix_multiplication<<<grid, thds, 0, streams[0]>>>(
-      (T *)output_lwe_array->ptr, (T *)input_lwe_array->ptr, scalar,
-      lwe_dimension, num_radix_blocks);
-  check_cuda_error(cudaGetLastError());
-
-  for (int i = 0; i < num_radix_blocks; i++) {
-    output_lwe_array->noise_levels[i] =
-        input_lwe_array->noise_levels[i] * scalar;
-    output_lwe_array->degrees[i] = input_lwe_array->degrees[i] * scalar;
-  }
-}
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -57,9 +57,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
  // one block is responsible to process single lwe ciphertext
  if (mem->shift_type == LEFT_SHIFT) {
    // rotate right as the blocks are from LSB to MSB
-    legacy_host_radix_blocks_rotate_right<Torus>(
-        streams, gpu_indexes, gpu_count, rotated_buffer, lwe_array, rotations,
-        num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          rotated_buffer, lwe_array, rotations,
+                                          num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, streams[0],
@@ -71,22 +71,22 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
-    legacy_host_radix_blocks_rotate_right<Torus>(
-        streams, gpu_indexes, gpu_count, giver_blocks, lwe_array, 1, num_blocks,
-        big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          giver_blocks, lwe_array, 1,
+                                          num_blocks, big_lwe_size);

    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
        lut_bivariate->params.message_modulus);

  } else {
    // rotate left as the blocks are from LSB to MSB
-    legacy_host_radix_blocks_rotate_left<Torus>(
-        streams, gpu_indexes, gpu_count, rotated_buffer, lwe_array, rotations,
-        num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks, big_lwe_size);

    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, streams[0],
@@ -98,13 +98,13 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

    auto receiver_blocks = lwe_array;
    auto giver_blocks = rotated_buffer;
-    legacy_host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
-                                                giver_blocks, lwe_array, 1,
-                                                num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         giver_blocks, lwe_array, 1, num_blocks,
+                                         big_lwe_size);

    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
        giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
        lut_bivariate->params.message_modulus);
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -53,9 +53,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

  if (mem->shift_type == LEFT_SHIFT) {
    // rotate right as the blocks are from LSB to MSB
-    legacy_host_radix_blocks_rotate_right<Torus>(
-        streams, gpu_indexes, gpu_count, rotated_buffer, lwe_array, rotations,
-        num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
+                                          rotated_buffer, lwe_array, rotations,
+                                          num_blocks, big_lwe_size);

    // create trivial assign for value = 0
    cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
@@ -75,7 +75,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

    size_t partial_block_count = num_blocks - rotations;

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, partial_current_blocks,
        partial_current_blocks, partial_previous_blocks, bsks, ksks,
        partial_block_count, lut_bivariate,
@@ -83,9 +83,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

  } else {
    // right shift
-    legacy_host_radix_blocks_rotate_left<Torus>(
-        streams, gpu_indexes, gpu_count, rotated_buffer, lwe_array, rotations,
-        num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks, big_lwe_size);

    // rotate left as the blocks are from LSB to MSB
    // create trivial assign for value = 0
@@ -106,7 +106,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

    size_t partial_block_count = num_blocks - rotations;

-    legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+    integer_radix_apply_bivariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, partial_current_blocks,
        partial_current_blocks, partial_next_blocks, bsks, ksks,
        partial_block_count, lut_bivariate,
@@ -156,9 +156,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
  Torus *last_block_copy = &padding_block[big_lwe_size];

  if (mem->shift_type == RIGHT_SHIFT) {
-    legacy_host_radix_blocks_rotate_left<Torus>(
-        streams, gpu_indexes, gpu_count, rotated_buffer, lwe_array, rotations,
-        num_blocks, big_lwe_size);
+    host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
+                                         rotated_buffer, lwe_array, rotations,
+                                         num_blocks, big_lwe_size);
    cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
                                 num_blocks * big_lwe_size_bytes, streams[0],
                                 gpu_indexes[0]);
@@ -200,7 +200,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
        size_t partial_block_count = num_blocks - rotations;
        auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

-        legacy_integer_radix_apply_bivariate_lookup_table_kb<Torus>(
+        integer_radix_apply_bivariate_lookup_table_kb<Torus>(
            streams, gpu_indexes, gpu_count, partial_current_blocks,
            partial_current_blocks, partial_next_blocks, bsks, ksks,
            partial_block_count, lut_bivariate,
@@ -213,7 +213,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      }
      auto lut_univariate_padding_block =
          mem->lut_buffers_univariate[num_bits_in_block - 1];
-      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      integer_radix_apply_univariate_lookup_table_kb<Torus>(
          mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
          last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
      // Replace blocks 'pulled' from the left with the correct padding
@@ -227,7 +227,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
      if (shift_within_block != 0) {
        auto lut_univariate_shift_last_block =
            mem->lut_buffers_univariate[shift_within_block - 1];
-        legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        integer_radix_apply_univariate_lookup_table_kb<Torus>(
            mem->local_streams_2, gpu_indexes, gpu_count, last_block,
            last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
      }
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cu
@@ -22,13 +22,15 @@ void scratch_cuda_integer_radix_shift_and_rotate_kb_64(

 void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    CudaRadixCiphertextFFI *lwe_array, CudaRadixCiphertextFFI const *lwe_shift,
-    int8_t *mem_ptr, void *const *bsks, void *const *ksks) {
+    void *lwe_array, void const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
+    void *const *ksks, uint32_t num_blocks) {

  host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array, lwe_shift,
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
+      static_cast<uint64_t *>(lwe_array),
+      static_cast<const uint64_t *>(lwe_shift),
      (int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
-      (uint64_t **)(ksks));
+      (uint64_t **)(ksks), num_blocks);
 }

 void cleanup_cuda_integer_radix_shift_and_rotate(void *const *streams,
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -26,37 +26,22 @@ __host__ void scratch_cuda_integer_radix_shift_and_rotate_kb(
 template <typename Torus>
 __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
-    CudaRadixCiphertextFFI const *lwe_shift,
+    uint32_t gpu_count, Torus *lwe_array, Torus const *lwe_shift,
    int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
-    Torus *const *ksks) {
-  cuda_set_device(gpu_indexes[0]);
-
-  if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)
-    PANIC("Cuda error: lwe_shift and lwe_array num radix blocks must be "
-          "the same")
-
-  if (lwe_array->lwe_dimension != lwe_shift->lwe_dimension)
-    PANIC("Cuda error: lwe_shift and lwe_array lwe_dimension must be "
-          "the same")
-
-  auto num_radix_blocks = lwe_array->num_radix_blocks;
-
+    Torus *const *ksks, uint32_t num_radix_blocks) {
  uint32_t bits_per_block = log2_int(mem->params.message_modulus);
  uint32_t total_nb_bits = bits_per_block * num_radix_blocks;
  if (total_nb_bits == 0)
    return;

  auto big_lwe_dimension = mem->params.big_lwe_dimension;
-
-  if (lwe_array->lwe_dimension != big_lwe_dimension)
-    PANIC("Cuda error: lwe_shift lwe_dimension must be equal to "
-          "big_lwe_dimension")
+  auto big_lwe_size = big_lwe_dimension + 1;
+  auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  // Extract all bits
  auto bits = mem->tmp_bits;
  extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
-                        ksks, num_radix_blocks * bits_per_block,
+                        ksks, num_radix_blocks, bits_per_block,
                        mem->bit_extract_luts);

  // Extract shift bits
@@ -76,14 +61,13 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  // Extracts bits and put them in the bit index 2 (=> bit number 3)
  // so that it is already aligned to the correct position of the cmux input
  // and we reduce noise growth
-  extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits, lwe_shift,
-                        bsks, ksks, max_num_bits_that_tell_shift,
+  extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, shift_bits,
+                        (Torus *)lwe_shift, bsks, ksks, 1,
+                        max_num_bits_that_tell_shift,
                        mem->bit_extract_luts_with_offset_2);

  // If signed, do an "arithmetic shift" by padding with the sign bit
-  CudaRadixCiphertextFFI last_bit;
-  as_radix_ciphertext_slice<Torus>(&last_bit, bits, (total_nb_bits - 1),
-                                   total_nb_bits);
+  auto last_bit = bits + (total_nb_bits - 1) * big_lwe_size;

  // Apply op
  auto rotated_input = mem->tmp_rotated;
@@ -92,59 +76,60 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
  auto mux_lut = mem->mux_lut;
  auto mux_inputs = mem->tmp_mux_inputs;

-  copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], input_bits_a,
-                                     bits);
+  cuda_memcpy_async_gpu_to_gpu(input_bits_a, bits,
+                               total_nb_bits * big_lwe_size_bytes, streams[0],
+                               gpu_indexes[0]);
  for (int d = 0; d < max_num_bits_that_tell_shift; d++) {
-    CudaRadixCiphertextFFI shift_bit;
-    as_radix_ciphertext_slice<Torus>(&shift_bit, shift_bits, d, d + 1);
+    auto shift_bit = shift_bits + d * big_lwe_size;
+
+    cuda_memcpy_async_gpu_to_gpu(input_bits_b, input_bits_a,
+                                 total_nb_bits * big_lwe_size_bytes, streams[0],
+                                 gpu_indexes[0]);

-    copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], input_bits_b,
-                                       input_bits_a);
    auto rotations = 1 << d;
    switch (mem->shift_type) {
    case LEFT_SHIFT:
      // rotate right as the blocks are from LSB to MSB
-      if (input_bits_b->num_radix_blocks != total_nb_bits)
-        PANIC("Cuda error: incorrect number of blocks")
-      host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
-                                            rotated_input, input_bits_b,
-                                            rotations);
+      host_radix_blocks_rotate_right<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);

-      set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
-                                                   rotated_input, 0, rotations);
+      if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
+        for (int i = 0; i < rotations; i++)
+          cuda_memcpy_async_gpu_to_gpu(rotated_input + i * big_lwe_size,
+                                       last_bit, big_lwe_size_bytes, streams[0],
+                                       gpu_indexes[0]);
+      else
+        cuda_memset_async(rotated_input, 0, rotations * big_lwe_size_bytes,
+                          streams[0], gpu_indexes[0]);
      break;
    case RIGHT_SHIFT:
      // rotate left as the blocks are from LSB to MSB
-      if (input_bits_b->num_radix_blocks != total_nb_bits)
-        PANIC("Cuda error: incorrect number of blocks")
-      host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
-                                           rotated_input, input_bits_b,
-                                           rotations);
+      host_radix_blocks_rotate_left<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);

      if (mem->is_signed)
-        for (int i = 0; i < rotations; i++) {
-          copy_radix_ciphertext_slice_async<Torus>(
-              streams[0], gpu_indexes[0], rotated_input,
-              total_nb_bits - rotations + i, total_nb_bits - rotations + i + 1,
-              &last_bit, 0, 1);
-        }
-      else {
-        set_zero_radix_ciphertext_slice_async<Torus>(
-            streams[0], gpu_indexes[0], rotated_input,
-            total_nb_bits - rotations, total_nb_bits);
-      }
+        for (int i = 0; i < rotations; i++)
+          cuda_memcpy_async_gpu_to_gpu(
+              rotated_input + (total_nb_bits - rotations + i) * big_lwe_size,
+              last_bit, big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+      else
+        cuda_memset_async(
+            rotated_input + (total_nb_bits - rotations) * big_lwe_size, 0,
+            rotations * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
      break;
    case LEFT_ROTATE:
      // rotate right as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
-                                            rotated_input, input_bits_b,
-                                            rotations);
+      host_radix_blocks_rotate_right<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);
      break;
    case RIGHT_ROTATE:
      // rotate left as the blocks are from LSB to MSB
-      host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
-                                           rotated_input, input_bits_b,
-                                           rotations);
+      host_radix_blocks_rotate_left<Torus>(
+          streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
+          rotations, total_nb_bits, big_lwe_size);
      break;
    default:
      PANIC("Unknown operation")
@@ -152,52 +137,61 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(

    // host_pack bits into one block so that we have
    // control_bit|b|a
-    pack_bivariate_blocks<Torus>(
-        streams, gpu_indexes, gpu_count, (Torus *)mux_inputs->ptr,
-        mux_lut->lwe_indexes_out, (Torus *)rotated_input->ptr,
-        (Torus *)input_bits_a->ptr, mux_lut->lwe_indexes_in, big_lwe_dimension,
-        2, total_nb_bits);
+    pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count, mux_inputs,
+                                 mux_lut->lwe_indexes_out, rotated_input,
+                                 input_bits_a, mux_lut->lwe_indexes_in,
+                                 big_lwe_dimension, 2, total_nb_bits);

    // The shift bit is already properly aligned/positioned
-    host_add_the_same_block_to_all_blocks<Torus>(
-        streams[0], gpu_indexes[0], mux_inputs, mux_inputs, &shift_bit);
+    for (int i = 0; i < total_nb_bits; i++)
+      host_addition<Torus>(streams[0], gpu_indexes[0],
+                           mux_inputs + i * big_lwe_size,
+                           mux_inputs + i * big_lwe_size, shift_bit,
+                           mem->params.big_lwe_dimension, 1);

    // we have
    // control_bit|b|a
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
-        mux_lut, total_nb_bits);
+        total_nb_bits, mux_lut);
  }

  // Initializes the output
  // Copy the last bit for each radix block
+  auto lwe_last_out = lwe_array;
+  last_bit = input_bits_a + (bits_per_block - 1) * big_lwe_size;
  for (int i = 0; i < num_radix_blocks; i++) {
-    auto last_bit_index = (bits_per_block - 1) + i * bits_per_block;
-    copy_radix_ciphertext_slice_async<Torus>(
-        streams[0], gpu_indexes[0], lwe_array, i, i + 1, input_bits_a,
-        last_bit_index, last_bit_index + 1);
+    cuda_memcpy_async_gpu_to_gpu(lwe_last_out, last_bit, big_lwe_size_bytes,
+                                 streams[0], gpu_indexes[0]);
+
+    lwe_last_out += big_lwe_size;
+    last_bit += bits_per_block * big_lwe_size;
  }

  // Bitshift and add the other bits
+  lwe_last_out = lwe_array;
  for (int i = bits_per_block - 2; i >= 0; i--) {
-    host_integer_small_scalar_mul_radix<Torus>(streams, gpu_indexes, gpu_count,
-                                               lwe_array, lwe_array, 2);
+
+    host_integer_small_scalar_mul_radix<Torus>(
+        streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, 2,
+        big_lwe_dimension, num_radix_blocks);
+
+    auto block = lwe_last_out;
+    auto bit_to_add = input_bits_a + i * big_lwe_size;
+
    for (int j = 0; j < num_radix_blocks; j++) {
-      CudaRadixCiphertextFFI block;
-      CudaRadixCiphertextFFI bit_to_add;
-      as_radix_ciphertext_slice<Torus>(&block, lwe_array, j, j + 1);
-      as_radix_ciphertext_slice<Torus>(&bit_to_add, input_bits_a,
-                                       i + j * bits_per_block,
-                                       i + j * bits_per_block + 1);
-      host_addition<Torus>(streams[0], gpu_indexes[0], &block, &block,
-                           &bit_to_add, 1);
+      host_addition<Torus>(streams[0], gpu_indexes[0], block, block, bit_to_add,
+                           big_lwe_dimension, 1);
+
+      block += big_lwe_size;
+      bit_to_add += bits_per_block * big_lwe_size;
    }

    // To give back a clean ciphertext
    auto cleaning_lut = mem->cleaning_lut;
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
-        streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
-        cleaning_lut, num_radix_blocks);
+        streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
+        num_radix_blocks, cleaning_lut);
  }
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
@@ -1,16 +1,20 @@
-#include "integer/integer.h"
 #include "linearalgebra/addition.cuh"

-void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
-                                       CudaRadixCiphertextFFI *output,
-                                       CudaRadixCiphertextFFI const *input_1,
-                                       CudaRadixCiphertextFFI const *input_2) {
+/*
+ * Perform the addition of two u32 input LWE ciphertext vectors.
+ * See the equivalent operation on u64 ciphertexts for more details.
+ */
+void cuda_add_lwe_ciphertext_vector_32(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

-  if (output->num_radix_blocks != input_1->num_radix_blocks ||
-      output->num_radix_blocks != input_2->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-  host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
-                          input_1, input_2, output->num_radix_blocks);
+  host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint32_t *>(lwe_array_out),
+                          static_cast<const uint32_t *>(lwe_array_in_1),
+                          static_cast<const uint32_t *>(lwe_array_in_2),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }

 /*
@@ -18,14 +22,14 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
 * - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
 * launch
 * - `gpu_index` is the index of the GPU to be used in the kernel launch
- * - `output` is an array of size
+ * - `lwe_array_out` is an array of size
 * `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
 * been allocated on the GPU before calling this function, and that will hold
 * the result of the computation.
- * - `input_1` is the first LWE ciphertext vector used as input, it
+ * - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it
 * should have been allocated and initialized before calling this function. It
 * has the same size as the output array.
- * - `input_2` is the second LWE ciphertext vector used as input, it
+ * - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it
 * should have been allocated and initialized before calling this function. It
 * has the same size as the output array.
 * - `input_lwe_dimension` is the number of mask elements in the two input and
@@ -39,16 +43,17 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
 * vectors are left unchanged. This function is a wrapper to a device function
 * that performs the operation on the GPU.
 */
-void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
-                                       CudaRadixCiphertextFFI *output,
-                                       CudaRadixCiphertextFFI const *input_1,
-                                       CudaRadixCiphertextFFI const *input_2) {
+void cuda_add_lwe_ciphertext_vector_64(
+    void *stream, uint32_t gpu_index, void *lwe_array_out,
+    void const *lwe_array_in_1, void const *lwe_array_in_2,
+    const uint32_t input_lwe_dimension,
+    const uint32_t input_lwe_ciphertext_count) {

-  if (output->num_radix_blocks != input_1->num_radix_blocks ||
-      output->num_radix_blocks != input_2->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-  host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index, output,
-                          input_1, input_2, output->num_radix_blocks);
+  host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
+                          static_cast<uint64_t *>(lwe_array_out),
+                          static_cast<const uint64_t *>(lwe_array_in_1),
+                          static_cast<const uint64_t *>(lwe_array_in_2),
+                          input_lwe_dimension, input_lwe_ciphertext_count);
 }

 /*
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -8,7 +8,6 @@

 #include "device.h"
 #include "helper_multi_gpu.h"
-#include "integer/integer.h"
 #include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
 #include <stdio.h>
@@ -51,7 +50,7 @@ __host__ void host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index,
                                      const uint32_t lwe_dimension,
                                      const uint32_t lwe_ciphertext_count) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -72,7 +71,7 @@ __host__ void host_addition_plaintext_scalar(
    const T plaintext_input, const uint32_t lwe_dimension,
    const uint32_t lwe_ciphertext_count) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -100,110 +99,13 @@ __global__ void addition(T *output, T const *input_1, T const *input_2,
 }

 // Coefficient-wise addition
-// num_radix_blocks selects the amount of blocks to be added from the inputs
 template <typename T>
-__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index,
-                            CudaRadixCiphertextFFI *output,
-                            CudaRadixCiphertextFFI const *input_1,
-                            CudaRadixCiphertextFFI const *input_2,
-                            uint32_t num_radix_blocks) {
-  if (output->lwe_dimension != input_1->lwe_dimension ||
-      output->lwe_dimension != input_2->lwe_dimension)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-  if (output->num_radix_blocks < num_radix_blocks ||
-      input_1->num_radix_blocks < num_radix_blocks ||
-      input_2->num_radix_blocks < num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be larger or "
-          "equal to the num blocks to add")
+__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
+                            T const *input_1, T const *input_2,
+                            const uint32_t input_lwe_dimension,
+                            const uint32_t input_lwe_ciphertext_count) {

-  cuda_set_device(gpu_index);
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = output->lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = num_radix_blocks * lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  addition<T><<<grid, thds, 0, stream>>>(
-      static_cast<T *>(output->ptr), static_cast<const T *>(input_1->ptr),
-      static_cast<const T *>(input_2->ptr), num_entries);
-  check_cuda_error(cudaGetLastError());
-  for (uint i = 0; i < num_radix_blocks; i++) {
-    output->degrees[i] = input_1->degrees[i] + input_2->degrees[i];
-    output->noise_levels[i] =
-        input_1->noise_levels[i] + input_2->noise_levels[i];
-  }
-}
-
-template <typename T>
-__global__ void constant_addition(T *output, T const *input_1, T const *input_2,
-                                  uint32_t lwe_size, uint32_t num_entries) {
-
-  int tid = threadIdx.x;
-  int index = blockIdx.x * blockDim.x + tid;
-  if (index < num_entries) {
-    // Here we take advantage of the wrapping behaviour of uint
-    output[index] = input_1[index] + input_2[index % lwe_size];
-  }
-}
-
-// Coefficient-wise addition by the same block
-// input_with_multiple_blocks is a radix ciphertext with potentially multiple
-// blocks input_with_single_block is a radix ciphertext with a single block
-//
-// This function adds the single block in input_with_single_block to each block
-// in input_with_multiple_blocks. The result is written to output
-template <typename T>
-__host__ void host_add_the_same_block_to_all_blocks(
-    cudaStream_t stream, uint32_t gpu_index, CudaRadixCiphertextFFI *output,
-    CudaRadixCiphertextFFI const *input_with_multiple_blocks,
-    CudaRadixCiphertextFFI const *input_with_single_block) {
-  if (output->num_radix_blocks != input_with_multiple_blocks->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be the same")
-  if (input_with_single_block->num_radix_blocks != 1)
-    PANIC(
-        "Cuda error: input_with_single_block must be a single-block ciphertext")
-  if (output->lwe_dimension != input_with_multiple_blocks->lwe_dimension ||
-      output->lwe_dimension != input_with_single_block->lwe_dimension)
-    PANIC("Cuda error: input and output lwe dimensions must be the same")
-
-  cuda_set_device(gpu_index);
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = output->lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = output->num_radix_blocks * lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  constant_addition<T><<<grid, thds, 0, stream>>>(
-      static_cast<T *>(output->ptr),
-      static_cast<const T *>(input_with_multiple_blocks->ptr),
-      static_cast<const T *>(input_with_single_block->ptr), lwe_size,
-      num_entries);
-  check_cuda_error(cudaGetLastError());
-  for (uint i = 0; i < output->num_radix_blocks; i++) {
-    output->degrees[i] = input_with_multiple_blocks->degrees[i] +
-                         input_with_single_block->degrees[0];
-    output->noise_levels[i] = input_with_multiple_blocks->noise_levels[i] +
-                              input_with_single_block->noise_levels[0];
-  }
-}
-
-// Coefficient-wise addition
-template <typename T>
-__host__ void legacy_host_addition(cudaStream_t stream, uint32_t gpu_index,
-                                   T *output, T const *input_1,
-                                   T const *input_2,
-                                   const uint32_t input_lwe_dimension,
-                                   const uint32_t input_lwe_ciphertext_count) {
-
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -239,7 +141,7 @@ __host__ void host_pack_for_overflowing_ops(cudaStream_t stream,
                                            uint32_t input_lwe_ciphertext_count,
                                            uint32_t message_modulus) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -277,7 +179,7 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
                               uint32_t input_lwe_dimension,
                               uint32_t input_lwe_ciphertext_count) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -315,7 +217,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
                                         uint32_t input_lwe_dimension,
                                         uint32_t input_lwe_ciphertext_count) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  int num_blocks = 0, num_threads = 0;
  int num_entries = input_lwe_ciphertext_count;
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -361,7 +263,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
    uint32_t input_lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus, uint32_t degree) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -34,7 +34,7 @@ __host__ void host_cleartext_vec_multiplication(
    T const *cleartext_input, const uint32_t input_lwe_dimension,
    const uint32_t input_lwe_ciphertext_count) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
@@ -70,7 +70,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
                              uint32_t input_lwe_dimension,
                              uint32_t input_lwe_ciphertext_count) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
@@ -26,7 +26,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
                            T const *input, const uint32_t input_lwe_dimension,
                            const uint32_t input_lwe_ciphertext_count) {

-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  // lwe_size includes the presence of the body
  // whereas lwe_dimension is the number of elements in the mask
  int lwe_size = input_lwe_dimension + 1;
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cu
@@ -96,7 +96,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
                                 uint32_t total_polynomials) {

  auto stream = static_cast<cudaStream_t>(stream_v);
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  auto input1 = (double2 *)_input1;
  auto input2 = (double2 *)_input2;
  auto output = (double2 *)_output;
@@ -106,12 +106,10 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
  int gridSize = total_polynomials;
  int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-
  double2 *buffer;
  switch (polynomial_size) {
  case 256:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
@@ -132,7 +130,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
    }
    break;
  case 512:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
@@ -153,7 +151,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
    }
    break;
  case 1024:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
@@ -174,7 +172,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
    }
    break;
  case 2048:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
@@ -195,7 +193,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
    }
    break;
  case 4096:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
@@ -216,7 +214,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
    }
    break;
  case 8192:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
@@ -237,7 +235,7 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
    }
    break;
  case 16384:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -78,7 +78,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
                                                 double2 *dest, ST const *src,
                                                 uint32_t polynomial_size,
                                                 uint32_t total_polynomials) {
-  cuda_set_device(gpu_index);
+  cudaSetDevice(gpu_index);
  int shared_memory_size = sizeof(double) * polynomial_size;

  // Here the buffer size is the size of double2 times the number of polynomials
@@ -111,12 +111,10 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,

  cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);

-  int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
-
  double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
  switch (polynomial_size) {
  case 256:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -134,7 +132,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 512:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -152,7 +150,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 1024:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -170,7 +168,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 2048:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -188,7 +186,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 4096:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -206,7 +204,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 8192:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -224,7 +222,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    }
    break;
  case 16384:
-    if (shared_memory_size <= max_shared_memory) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
David Testé	cae938a75b	wip: measure latencies of a list of cts	2025-01-28 11:19:27 +01:00
David Testé	bae1d1cf77	WIP: fix gpu streams and use iter_batched	2025-01-22 10:56:08 +01:00
David Testé	a3bc1a9d9e	chore(bench): new heuristic to define elements for throughput This is done to fill up backend with enough elements to fill the backend and avoid having long execution time for heavy operations like multiplication or division.	2025-01-20 15:21:05 +01:00